Skip to content

Commit

Permalink
etcd: Fix incorrect context usage in session renewal
Browse files Browse the repository at this point in the history
The context passed into NewSession() was supposed to enforce a timeout on the
NewSession() operation which is triggering a Grant() and KeepAlive()
instruction. However, the context passed into NewSession() will also be
associated with the resulting lease. As per etcd documentation, this will
result in:

> If the context is canceled before Close() completes, the session's lease will
> be abandoned and left to expire instead of being revoked.

Because of this, any session renewal triggering a new session would create a
session that is immediately closed again due to the context passed into
NewSession being cancelled when the controller run ends successfully. This
resulted in any renewed session to have an effective lifetime of 10
milliseconds before requiring renewal again.

Fixes: #12619
Fixes: 8524fca ("kvstore: Add session renew backoff")

Signed-off-by: Thomas Graf <thomas@cilium.io>
  • Loading branch information
tgraf authored and rolinh committed Jul 23, 2020
1 parent 02a1810 commit 0262854
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 8 deletions.
43 changes: 43 additions & 0 deletions pkg/contexthelpers/context.go
@@ -0,0 +1,43 @@
// Copyright 2020 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package contexthelpers

import (
"context"
"time"
)

type SuccessChan chan bool

// NewConditionalTimeoutContext returns a context which is cancelled when
// success is not reported within the specified timeout
func NewConditionalTimeoutContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc, SuccessChan) {
ch := make(SuccessChan)
c, cancel := context.WithCancel(ctx)

go func() {
select {
case success := <-ch:
if !success {
cancel()
return
}
case <-time.After(timeout):
cancel()
}
}()

return c, cancel, ch
}
63 changes: 63 additions & 0 deletions pkg/contexthelpers/context_test.go
@@ -0,0 +1,63 @@
// Copyright 2020 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !privileged_tests

package contexthelpers

import (
"context"
"testing"
"time"

"gopkg.in/check.v1"
)

func Test(t *testing.T) {
check.TestingT(t)
}

type ContextSuite struct{}

var _ = check.Suite(&ContextSuite{})

func (b *ContextSuite) TestConditionalTimeoutContext(c *check.C) {
ctx, cancel, ch := NewConditionalTimeoutContext(context.Background(), 10*time.Millisecond)
c.Assert(ctx, check.Not(check.IsNil))
c.Assert(cancel, check.Not(check.IsNil))
c.Assert(ch, check.Not(check.IsNil))

// validate that the context is being cancelled due to the 10
// millisecond timeout specified
select {
case <-ctx.Done():
case <-time.After(time.Second):
c.Errorf("conditional timeout was not triggered")
}

ctx, cancel, ch = NewConditionalTimeoutContext(context.Background(), 10*time.Millisecond)
// report success via the channel
ch <- true
close(ch)

// validate that the context is not being cancelled as success has been
// reported
select {
case <-ctx.Done():
c.Errorf("context cancelled despite reporting success")
case <-time.After(100 * time.Millisecond):
}

cancel()
}
51 changes: 43 additions & 8 deletions pkg/kvstore/etcd.go
Expand Up @@ -27,6 +27,7 @@ import (
"strings"
"time"

"github.com/cilium/cilium/pkg/contexthelpers"
"github.com/cilium/cilium/pkg/controller"
"github.com/cilium/cilium/pkg/defaults"
"github.com/cilium/cilium/pkg/lock"
Expand Down Expand Up @@ -269,8 +270,12 @@ type etcdClient struct {

// protects sessions from concurrent access
lock.RWMutex
session *concurrency.Session
lockSession *concurrency.Session

session *concurrency.Session
sessionCancel context.CancelFunc

lockSession *concurrency.Session
lockSessionCancel context.CancelFunc

// statusLock protects latestStatusSnapshot and latestErrorStatus for
// read/write access
Expand Down Expand Up @@ -520,21 +525,36 @@ func (e *etcdClient) renewSession(ctx context.Context) error {
// routines can get a lease ID of an already expired lease.
e.Lock()

timeoutCtx, cancel := context.WithTimeout(ctx, statusCheckTimeout)
defer cancel()
// Cancel any eventual old session context
if e.sessionCancel != nil {
e.sessionCancel()
e.sessionCancel = nil
}

// Create a context representing the lifetime of the session. It will
// timeout if the session creation does not succeed in time and then
// persists until any of the below conditions are met:
// - The parent context is cancelled due to the etcd client closing or
// the controller being shut down
// - The above call to sessionCancel() cancels the session due to the
// session ending and requiring renewal.
sessionContext, sessionCancel, sessionSuccess := contexthelpers.NewConditionalTimeoutContext(ctx, statusCheckTimeout)
defer close(sessionSuccess)

newSession, err := concurrency.NewSession(
e.client,
concurrency.WithTTL(int(option.Config.KVstoreLeaseTTL.Seconds())),
concurrency.WithContext(timeoutCtx),
concurrency.WithContext(sessionContext),
)
if err != nil {
e.UnlockIgnoreTime()
return fmt.Errorf("unable to renew etcd session: %s", err)
}
sessionSuccess <- true
log.Infof("Got new lease ID %x", newSession.Lease())

e.session = newSession
e.sessionCancel = sessionCancel
e.UnlockIgnoreTime()

e.getLogger().WithField(fieldSession, newSession).Debug("Renewing etcd session")
Expand Down Expand Up @@ -568,20 +588,35 @@ func (e *etcdClient) renewLockSession(ctx context.Context) error {
// routines can get a lease ID of an already expired lease.
e.Lock()

timeoutCtx, cancel := context.WithTimeout(ctx, statusCheckTimeout)
defer cancel()
if e.lockSessionCancel != nil {
e.lockSessionCancel()
e.lockSessionCancel = nil
}

// Create a context representing the lifetime of the lock session. It
// will timeout if the session creation does not succeed in time and
// persists until any of the below conditions are met:
// - The parent context is cancelled due to the etcd client closing or
// the controller being shut down
// - The above call to sessionCancel() cancels the session due to the
// session ending and requiring renewal.
sessionContext, sessionCancel, sessionSuccess := contexthelpers.NewConditionalTimeoutContext(ctx, statusCheckTimeout)
defer close(sessionSuccess)

newSession, err := concurrency.NewSession(
e.client,
concurrency.WithTTL(int(defaults.LockLeaseTTL.Seconds())),
concurrency.WithContext(timeoutCtx),
concurrency.WithContext(sessionContext),
)
if err != nil {
e.UnlockIgnoreTime()
return fmt.Errorf("unable to renew etcd lock session: %s", err)
}
sessionSuccess <- true
log.Infof("Got new lock lease ID %x", newSession.Lease())

e.lockSession = newSession
e.lockSessionCancel = sessionCancel
e.UnlockIgnoreTime()

e.getLogger().WithField(fieldSession, newSession).Debug("Renewing etcd lock session")
Expand Down

0 comments on commit 0262854

Please sign in to comment.