Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add exponential backoff to orbit enroll retries #17368

Merged
merged 3 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions Dockerfile-desktop-linux
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
FROM --platform=linux/amd64 golang:1.21.7-bullseye@sha256:447afe790df28e0bc19d782a9f776a105ce3b8417cdd21f33affc4ed6d38f9d5
LABEL maintainer="Fleet Developers"

RUN apt-get update && apt-get install -y \
gcc \
libgtk-3-dev \
libayatana-appindicator3-dev \
&& rm -rf /var/lib/apt/lists/*
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not needed anymore because we've recently replaced https://github.com/getlantern/systray with a fork https://github.com/fyne-io/systray (which doesn't have any GTK dependencies on Linux).


RUN mkdir -p /usr/src/fleet
RUN mkdir -p /output

Expand Down
1 change: 1 addition & 0 deletions orbit/changes/16594-orbit-enroll-backoff
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* Add exponential backoff to orbit enroll retries.
11 changes: 7 additions & 4 deletions orbit/pkg/constant/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ const (
DesktopAppExecName = "fleet-desktop"
// OrbitNodeKeyFileName is the filename on disk where we write the orbit node key to
OrbitNodeKeyFileName = "secret-orbit-node-key.txt"
// OrbitEnrollMaxRetries is the max retries when doing an enroll request
OrbitEnrollMaxRetries = 3
// OrbitEnrollRetrySleep is the time duration to sleep between retries
OrbitEnrollRetrySleep = 5 * time.Second
// OrbitEnrollMaxRetries is the max number of retries when doing an enroll request.
// We set it to 6 to allow the retry backoff to take effect.
OrbitEnrollMaxRetries = 6
// OrbitEnrollBackoffMultiplier is the multiplier to use for backing off between enroll retries.
OrbitEnrollBackoffMultiplier = 2
// OrbitEnrollRetrySleep is the duration to sleep between enroll retries.
OrbitEnrollRetrySleep = 10 * time.Second
lucasmrod marked this conversation as resolved.
Show resolved Hide resolved
// OsquerydName is the name of osqueryd binary
// We use osqueryd as name to properly identify the process when listing
// running processes/tasks.
Expand Down
27 changes: 21 additions & 6 deletions pkg/retry/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,26 @@ import (
)

type config struct {
interval time.Duration
maxAttempts int
initialInterval time.Duration
backoffMultiplier int
maxAttempts int
}

// Option allows to configure the behavior of retry.Do
type Option func(*config)

// WithRetryInterval allows to specify a custom duration to wait
// WithInterval allows to specify a custom duration to wait
// between retries.
func WithInterval(i time.Duration) Option {
return func(c *config) {
c.interval = i
c.initialInterval = i
}
}

// WithBackoffMultiplier allows to specify the backoff multiplier between retries.
func WithBackoffMultiplier(m int) Option {
return func(c *config) {
c.backoffMultiplier = m
}
}

Expand All @@ -37,16 +45,17 @@ func WithMaxAttempts(a int) Option {
// seconds
func Do(fn func() error, opts ...Option) error {
cfg := &config{
interval: 30 * time.Second,
initialInterval: 30 * time.Second,
}
for _, opt := range opts {
opt(cfg)
}

attempts := 0
ticker := time.NewTicker(cfg.interval)
ticker := time.NewTicker(cfg.initialInterval)
defer ticker.Stop()

backoff := 1
for {
attempts++
err := fn()
Expand All @@ -58,6 +67,12 @@ func Do(fn func() error, opts ...Option) error {
return err
}

if cfg.backoffMultiplier != 0 {
interval := time.Duration(backoff) * cfg.initialInterval
backoff *= cfg.backoffMultiplier
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Looks strange that you're setting backoff after using it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How's so? I'm updating it's value for the next iteration.

ticker.Reset(interval)
}

<-ticker.C
}
}
30 changes: 30 additions & 0 deletions pkg/retry/retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,34 @@ func TestRetryDo(t *testing.T) {
require.NoError(t, err)
require.Equal(t, max, count)
})

t.Run("with backoff", func(t *testing.T) {
count := 0
max := 4
start := time.Now()
err := Do(func() error {
switch count {
case 0:
require.WithinDuration(t, start, time.Now(), 1*time.Millisecond)
case 1:
require.WithinDuration(t, start.Add(50*time.Millisecond), time.Now(), 10*time.Millisecond)
case 2:
require.WithinDuration(t, start.Add((50+100)*time.Millisecond), time.Now(), 10*time.Millisecond)
case 3:
require.WithinDuration(t, start.Add((50+100+200)*time.Millisecond), time.Now(), 10*time.Millisecond)
}
count++
if count != max {
return errTest
}
return nil
},
WithInterval(50*time.Millisecond),
WithBackoffMultiplier(2),
WithMaxAttempts(4),
)

require.NoError(t, err)
require.Equal(t, max, count)
})
}
8 changes: 6 additions & 2 deletions server/service/orbit_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,12 @@ func (oc *OrbitClient) getNodeKeyOrEnroll() (string, error) {
return err
}
},
retry.WithInterval(OrbitRetryInterval()),
// The below configuration means the following retry intervals (exponential backoff):
// 10s, 20s, 40s, 80s, 160s and then return the failure (max attempts = 6)
// thus executing no more than ~6 enroll request failures every ~5 minutes.
retry.WithInterval(orbitEnrollRetryInterval()),
retry.WithMaxAttempts(constant.OrbitEnrollMaxRetries),
retry.WithBackoffMultiplier(constant.OrbitEnrollBackoffMultiplier),
); err != nil {
return "", fmt.Errorf("orbit node key enroll failed, attempts=%d", constant.OrbitEnrollMaxRetries)
}
Expand Down Expand Up @@ -402,7 +406,7 @@ func (oc *OrbitClient) setLastRecordedError(err error) {
oc.lastRecordedErr = fmt.Errorf("%s: %w", time.Now().UTC().Format("2006-01-02T15:04:05Z"), err)
}

func OrbitRetryInterval() time.Duration {
func orbitEnrollRetryInterval() time.Duration {
interval := os.Getenv("FLEETD_ENROLL_RETRY_INTERVAL")
if interval != "" {
d, err := time.ParseDuration(interval)
Expand Down
4 changes: 3 additions & 1 deletion tools/tuf/test/create_repository.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ for system in $SYSTEMS; do
# compiling a macOS-arm64 binary requires CGO and a macOS computer (for
# Apple keychain, some tables, etc), if this is the case, compile an
# universal binary.
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ]; then
#
# NOTE(lucas): Cross-compiling orbit for arm64 from Intel macOS currently fails (CGO error).
if [ $system == "macos" ] && [ "$(uname -s)" = "Darwin" ] && [ "$(uname -m)" = "arm64" ]; then
CGO_ENABLED=1 \
CODESIGN_IDENTITY=$CODESIGN_IDENTITY \
ORBIT_VERSION=42 \
Expand Down
Loading