From 02628547e06fde912a8fafb0a71d61cddc72dae3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 22 Jul 2020 15:41:54 +0200 Subject: [PATCH] etcd: Fix incorrect context usage in session renewal The context passed into NewSession() was supposed to enforce a timeout on the NewSession() operation which is triggering a Grant() and KeepAlive() instruction. However, the context passed into NewSession() will also be associated with the resulting lease. As per etcd documentation, this will result in: > If the context is canceled before Close() completes, the session's lease will > be abandoned and left to expire instead of being revoked. Because of this, any session renewal triggering a new session would create a session that is immediately closed again due to the context passed into NewSession being cancelled when the controller run ends successfully. This resulted in any renewed session to have an effective lifetime of 10 milliseconds before requiring renewal again. Fixes: #12619 Fixes: 8524fca879b ("kvstore: Add session renew backoff") Signed-off-by: Thomas Graf --- pkg/contexthelpers/context.go | 43 ++++++++++++++++++++ pkg/contexthelpers/context_test.go | 63 ++++++++++++++++++++++++++++++ pkg/kvstore/etcd.go | 51 ++++++++++++++++++++---- 3 files changed, 149 insertions(+), 8 deletions(-) create mode 100644 pkg/contexthelpers/context.go create mode 100644 pkg/contexthelpers/context_test.go diff --git a/pkg/contexthelpers/context.go b/pkg/contexthelpers/context.go new file mode 100644 index 000000000000..202a778b1fd9 --- /dev/null +++ b/pkg/contexthelpers/context.go @@ -0,0 +1,43 @@ +// Copyright 2020 Authors of Cilium +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package contexthelpers + +import ( + "context" + "time" +) + +type SuccessChan chan bool + +// NewConditionalTimeoutContext returns a context which is cancelled when +// success is not reported within the specified timeout +func NewConditionalTimeoutContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc, SuccessChan) { + ch := make(SuccessChan) + c, cancel := context.WithCancel(ctx) + + go func() { + select { + case success := <-ch: + if !success { + cancel() + return + } + case <-time.After(timeout): + cancel() + } + }() + + return c, cancel, ch +} diff --git a/pkg/contexthelpers/context_test.go b/pkg/contexthelpers/context_test.go new file mode 100644 index 000000000000..b931456c03c4 --- /dev/null +++ b/pkg/contexthelpers/context_test.go @@ -0,0 +1,63 @@ +// Copyright 2020 Authors of Cilium +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !privileged_tests + +package contexthelpers + +import ( + "context" + "testing" + "time" + + "gopkg.in/check.v1" +) + +func Test(t *testing.T) { + check.TestingT(t) +} + +type ContextSuite struct{} + +var _ = check.Suite(&ContextSuite{}) + +func (b *ContextSuite) TestConditionalTimeoutContext(c *check.C) { + ctx, cancel, ch := NewConditionalTimeoutContext(context.Background(), 10*time.Millisecond) + c.Assert(ctx, check.Not(check.IsNil)) + c.Assert(cancel, check.Not(check.IsNil)) + c.Assert(ch, check.Not(check.IsNil)) + + // validate that the context is being cancelled due to the 10 + // millisecond timeout specified + select { + case <-ctx.Done(): + case <-time.After(time.Second): + c.Errorf("conditional timeout was not triggered") + } + + ctx, cancel, ch = NewConditionalTimeoutContext(context.Background(), 10*time.Millisecond) + // report success via the channel + ch <- true + close(ch) + + // validate that the context is not being cancelled as success has been + // reported + select { + case <-ctx.Done(): + c.Errorf("context cancelled despite reporting success") + case <-time.After(100 * time.Millisecond): + } + + cancel() +} diff --git a/pkg/kvstore/etcd.go b/pkg/kvstore/etcd.go index bfd4d8dab555..780e35e74b1a 100644 --- a/pkg/kvstore/etcd.go +++ b/pkg/kvstore/etcd.go @@ -27,6 +27,7 @@ import ( "strings" "time" + "github.com/cilium/cilium/pkg/contexthelpers" "github.com/cilium/cilium/pkg/controller" "github.com/cilium/cilium/pkg/defaults" "github.com/cilium/cilium/pkg/lock" @@ -269,8 +270,12 @@ type etcdClient struct { // protects sessions from concurrent access lock.RWMutex - session *concurrency.Session - lockSession *concurrency.Session + + session *concurrency.Session + sessionCancel context.CancelFunc + + lockSession *concurrency.Session + lockSessionCancel context.CancelFunc // statusLock protects latestStatusSnapshot and latestErrorStatus for // read/write access @@ -520,21 +525,36 @@ func (e *etcdClient) renewSession(ctx context.Context) error { // routines can get a lease ID of an already expired lease. e.Lock() - timeoutCtx, cancel := context.WithTimeout(ctx, statusCheckTimeout) - defer cancel() + // Cancel any eventual old session context + if e.sessionCancel != nil { + e.sessionCancel() + e.sessionCancel = nil + } + + // Create a context representing the lifetime of the session. It will + // timeout if the session creation does not succeed in time and then + // persists until any of the below conditions are met: + // - The parent context is cancelled due to the etcd client closing or + // the controller being shut down + // - The above call to sessionCancel() cancels the session due to the + // session ending and requiring renewal. + sessionContext, sessionCancel, sessionSuccess := contexthelpers.NewConditionalTimeoutContext(ctx, statusCheckTimeout) + defer close(sessionSuccess) newSession, err := concurrency.NewSession( e.client, concurrency.WithTTL(int(option.Config.KVstoreLeaseTTL.Seconds())), - concurrency.WithContext(timeoutCtx), + concurrency.WithContext(sessionContext), ) if err != nil { e.UnlockIgnoreTime() return fmt.Errorf("unable to renew etcd session: %s", err) } + sessionSuccess <- true log.Infof("Got new lease ID %x", newSession.Lease()) e.session = newSession + e.sessionCancel = sessionCancel e.UnlockIgnoreTime() e.getLogger().WithField(fieldSession, newSession).Debug("Renewing etcd session") @@ -568,20 +588,35 @@ func (e *etcdClient) renewLockSession(ctx context.Context) error { // routines can get a lease ID of an already expired lease. e.Lock() - timeoutCtx, cancel := context.WithTimeout(ctx, statusCheckTimeout) - defer cancel() + if e.lockSessionCancel != nil { + e.lockSessionCancel() + e.lockSessionCancel = nil + } + + // Create a context representing the lifetime of the lock session. It + // will timeout if the session creation does not succeed in time and + // persists until any of the below conditions are met: + // - The parent context is cancelled due to the etcd client closing or + // the controller being shut down + // - The above call to sessionCancel() cancels the session due to the + // session ending and requiring renewal. + sessionContext, sessionCancel, sessionSuccess := contexthelpers.NewConditionalTimeoutContext(ctx, statusCheckTimeout) + defer close(sessionSuccess) + newSession, err := concurrency.NewSession( e.client, concurrency.WithTTL(int(defaults.LockLeaseTTL.Seconds())), - concurrency.WithContext(timeoutCtx), + concurrency.WithContext(sessionContext), ) if err != nil { e.UnlockIgnoreTime() return fmt.Errorf("unable to renew etcd lock session: %s", err) } + sessionSuccess <- true log.Infof("Got new lock lease ID %x", newSession.Lease()) e.lockSession = newSession + e.lockSessionCancel = sessionCancel e.UnlockIgnoreTime() e.getLogger().WithField(fieldSession, newSession).Debug("Renewing etcd lock session")