From d3bb6f688b4643155b4a9924cec726bdc76a1306 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Mon, 25 Mar 2024 23:10:35 +0800 Subject: [PATCH] *: LeaseTimeToLive returns error if leader changed The old leader demotes lessor and all the leases' expire time will be updated. Instead of returning incorrect remaining TTL, we should return errors to force client retry. Signed-off-by: Wei Fu --- server/etcdserver/v3_server.go | 12 +++++ server/lease/lease.go | 7 +++ server/lease/leasehttp/http.go | 11 +++++ tests/integration/v3_lease_test.go | 76 ++++++++++++++++++++++++++++++ tests/robustness/makefile.mk | 4 +- 5 files changed, 108 insertions(+), 2 deletions(-) diff --git a/server/etcdserver/v3_server.go b/server/etcdserver/v3_server.go index 30c0d506232..6c5eefcb31a 100644 --- a/server/etcdserver/v3_server.go +++ b/server/etcdserver/v3_server.go @@ -357,6 +357,9 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR if err := s.waitAppliedIndex(); err != nil { return nil, err } + + // gofail: var beforeLookupWhenLeaseTimeToLive struct{} + // primary; timetolive directly from leader le := s.lessor.Lookup(lease.LeaseID(r.ID)) if le == nil { @@ -372,6 +375,15 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR } resp.Keys = kbs } + + // The leasor could be demoted if leader changed during lookup. + // We should return error to force retry instead of returning + // incorrect remaining TTL. + if le.Demoted() { + // NOTE: lease.ErrNotPrimary is not retryable error for + // client. Instead, uses ErrLeaderChanged. + return nil, errors.ErrLeaderChanged + } return resp, nil } diff --git a/server/lease/lease.go b/server/lease/lease.go index b35a6efdc70..95f3eb6f756 100644 --- a/server/lease/lease.go +++ b/server/lease/lease.go @@ -95,6 +95,13 @@ func (l *Lease) forever() { l.expiry = forever } +// Demoted returns true if the lease's expiry has been reset to forever. +func (l *Lease) Demoted() bool { + l.expiryMu.Lock() + defer l.expiryMu.Unlock() + return l.expiry == forever +} + // Keys returns all the keys attached to the lease. func (l *Lease) Keys() []string { l.mu.RLock() diff --git a/server/lease/leasehttp/http.go b/server/lease/leasehttp/http.go index 7c9f56bde5c..9a337132a68 100644 --- a/server/lease/leasehttp/http.go +++ b/server/lease/leasehttp/http.go @@ -103,6 +103,9 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { http.Error(w, ErrLeaseHTTPTimeout.Error(), http.StatusRequestTimeout) return } + + // gofail: var beforeLookupWhenForwardLeaseTimeToLive struct{} + l := h.l.Lookup(lease.LeaseID(lreq.LeaseTimeToLiveRequest.ID)) if l == nil { http.Error(w, lease.ErrLeaseNotFound.Error(), http.StatusNotFound) @@ -126,6 +129,14 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { resp.LeaseTimeToLiveResponse.Keys = kbs } + // The leasor could be demoted if leader changed during lookup. + // We should return error to force retry instead of returning + // incorrect remaining TTL. + if l.Demoted() { + http.Error(w, lease.ErrNotPrimary.Error(), http.StatusInternalServerError) + return + } + v, err = resp.Marshal() if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) diff --git a/tests/integration/v3_lease_test.go b/tests/integration/v3_lease_test.go index 8e7b16e6981..d9d9a7423c9 100644 --- a/tests/integration/v3_lease_test.go +++ b/tests/integration/v3_lease_test.go @@ -22,6 +22,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "google.golang.org/grpc/codes" "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" @@ -30,8 +32,10 @@ import ( "go.etcd.io/etcd/api/v3/mvccpb" "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" "go.etcd.io/etcd/client/pkg/v3/testutil" + clientv3 "go.etcd.io/etcd/client/v3" framecfg "go.etcd.io/etcd/tests/v3/framework/config" "go.etcd.io/etcd/tests/v3/framework/integration" + gofail "go.etcd.io/gofail/runtime" ) // TestV3LeasePromote ensures the newly elected leader can promote itself @@ -1046,6 +1050,78 @@ func TestV3LeaseRecoverKeyWithMutipleLease(t *testing.T) { } } +func TestV3LeaseTimeToLiveWithLeaderChanged(t *testing.T) { + t.Run("normal", func(subT *testing.T) { + testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenLeaseTimeToLive") + }) + + t.Run("forward", func(subT *testing.T) { + testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenForwardLeaseTimeToLive") + }) +} + +func testV3LeaseTimeToLiveWithLeaderChanged(t *testing.T, fpName string) { + if len(gofail.List()) == 0 { + t.Skip("please run 'make gofail-enable' before running the test") + } + + integration.BeforeTest(t) + + clus := integration.NewCluster(t, &integration.ClusterConfig{Size: 3}) + defer clus.Terminate(t) + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + oldLeadIdx := clus.WaitLeader(t) + followerIdx := (oldLeadIdx + 1) % 3 + + followerMemberID := clus.Members[followerIdx].ID() + + oldLeadC := clus.Client(oldLeadIdx) + + leaseResp, err := oldLeadC.Grant(ctx, 100) + require.NoError(t, err) + + require.NoError(t, gofail.Enable(fpName, `sleep("3s")`)) + t.Cleanup(func() { + terr := gofail.Disable(fpName) + if terr != nil && terr != gofail.ErrDisabled { + t.Fatalf("failed to disable %s: %v", fpName, terr) + } + }) + + readyCh := make(chan struct{}) + errCh := make(chan error, 1) + + var targetC *clientv3.Client + switch fpName { + case "beforeLookupWhenLeaseTimeToLive": + targetC = oldLeadC + case "beforeLookupWhenForwardLeaseTimeToLive": + targetC = clus.Client((oldLeadIdx + 2) % 3) + default: + t.Fatalf("unsupported %s failpoint", fpName) + } + + go func() { + <-readyCh + time.Sleep(1 * time.Second) + + _, merr := oldLeadC.MoveLeader(ctx, uint64(followerMemberID)) + assert.NoError(t, gofail.Disable(fpName)) + errCh <- merr + }() + + close(readyCh) + + ttlResp, err := targetC.TimeToLive(ctx, leaseResp.ID) + require.NoError(t, err) + require.GreaterOrEqual(t, int64(100), ttlResp.TTL) + + require.NoError(t, <-errCh) +} + // acquireLeaseAndKey creates a new lease and creates an attached key. func acquireLeaseAndKey(clus *integration.Cluster, key string) (int64, error) { // create lease diff --git a/tests/robustness/makefile.mk b/tests/robustness/makefile.mk index dee968eb63a..8d50c03dea5 100644 --- a/tests/robustness/makefile.mk +++ b/tests/robustness/makefile.mk @@ -36,7 +36,7 @@ GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/g .PHONY: gofail-enable gofail-enable: install-gofail - gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ + gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION} cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION} cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION} @@ -44,7 +44,7 @@ gofail-enable: install-gofail .PHONY: gofail-disable gofail-disable: install-gofail - gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ + gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ cd ./server && go mod tidy cd ./etcdutl && go mod tidy cd ./etcdctl && go mod tidy