Skip to content

Commit 81c6039

Browse files
authored
feat(compute/metadata): retry error when talking to metadata service (#4648)
This was reported internally to be causing issues. This package is used by some of our auth flows, so it would be good to make it more resilient to transient failures. The implementation is inspired by what we do for some of our HTTP-based services. Because this package is currently not context-aware, I needed to cap the number of attempts so that retrying does not happen forever. Five attempts was chosen arbitrarily. Fixes: #4642 Release-As: 0.94.0
1 parent b31646d commit 81c6039

File tree

5 files changed

+278
-2
lines changed

5 files changed

+278
-2
lines changed

compute/metadata/metadata.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ import (
3232
"strings"
3333
"sync"
3434
"time"
35+
36+
"github.com/googleapis/gax-go/v2"
3537
)
3638

3739
const (
@@ -282,6 +284,7 @@ func NewClient(c *http.Client) *Client {
282284
// getETag returns a value from the metadata service as well as the associated ETag.
283285
// This func is otherwise equivalent to Get.
284286
func (c *Client) getETag(suffix string) (value, etag string, err error) {
287+
ctx := context.TODO()
285288
// Using a fixed IP makes it very difficult to spoof the metadata service in
286289
// a container, which is an important use-case for local testing of cloud
287290
// deployments. To enable spoofing of the metadata service, the environment
@@ -304,8 +307,20 @@ func (c *Client) getETag(suffix string) (value, etag string, err error) {
304307
}
305308
req.Header.Set("Metadata-Flavor", "Google")
306309
req.Header.Set("User-Agent", userAgent)
307-
res, err := c.hc.Do(req)
308-
if err != nil {
310+
var res *http.Response
311+
retryer := newRetryer()
312+
for {
313+
var err error
314+
res, err = c.hc.Do(req)
315+
if err == nil {
316+
break
317+
}
318+
if delay, shouldRetry := retryer.Retry(res.StatusCode, err); shouldRetry {
319+
if err := gax.Sleep(ctx, delay); err != nil {
320+
return "", "", err
321+
}
322+
continue
323+
}
309324
return "", "", err
310325
}
311326
defer res.Body.Close()

compute/metadata/retry.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Copyright 2021 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package metadata
16+
17+
import (
18+
"io"
19+
"time"
20+
21+
"github.com/googleapis/gax-go/v2"
22+
)
23+
24+
// maxRetryAttempts caps how many times a single metadataRetryer will
// approve a retry before giving up.
const maxRetryAttempts = 5
27+
28+
// syscallRetryable reports whether err is a socket-level error worth
// retrying. The default treats no error as retryable; platform-specific
// files may replace it from an init function.
var syscallRetryable = func(error) bool {
	return false
}
31+
32+
func newRetryer() *metadataRetryer {
33+
return &metadataRetryer{bo: &gax.Backoff{Initial: 100 * time.Millisecond}}
34+
}
35+
36+
// backoff supplies the delay used between retry attempts. It exists so the
// retryer can be driven by a real backoff or by a test stub.
type backoff interface {
	// Pause returns how long to wait before the next attempt.
	Pause() time.Duration
}
39+
40+
type metadataRetryer struct {
41+
bo backoff
42+
attempts int
43+
}
44+
45+
func (r *metadataRetryer) Retry(status int, err error) (time.Duration, bool) {
46+
retryOk := shouldRetry(status, err)
47+
if !retryOk {
48+
return 0, false
49+
}
50+
if r.attempts == maxRetryAttempts {
51+
return 0, false
52+
}
53+
r.attempts++
54+
return r.bo.Pause(), true
55+
}
56+
57+
func shouldRetry(status int, err error) bool {
58+
if 500 <= status && status <= 599 {
59+
return true
60+
}
61+
if err == io.ErrUnexpectedEOF {
62+
return true
63+
}
64+
// Transient network errors should be retried.
65+
if syscallRetryable(err) {
66+
return true
67+
}
68+
if err, ok := err.(interface{ Temporary() bool }); ok {
69+
if err.Temporary() {
70+
return true
71+
}
72+
}
73+
if err, ok := err.(interface{ Unwrap() error }); ok {
74+
return shouldRetry(status, err.Unwrap())
75+
}
76+
return false
77+
}

compute/metadata/retry_linux.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright 2021 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// +build linux
16+
17+
package metadata
18+
19+
import "syscall"
20+
21+
func init() {
22+
// Initialize syscallRetryable to return true on transient socket-level
23+
// errors. These errors are specific to Linux.
24+
syscallRetryable = func(err error) bool { return err == syscall.ECONNRESET || err == syscall.ECONNREFUSED }
25+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright 2021 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// +build linux
16+
17+
package metadata
18+
19+
import (
20+
"syscall"
21+
"testing"
22+
)
23+
24+
func TestMetadataRetryerLinux(t *testing.T) {
25+
retryer := metadataRetryer{bo: constantBackoff{}}
26+
27+
t.Run("retry on syscall.ECONNRESET", func(t *testing.T) {
28+
_, shouldRetry := retryer.Retry(400, syscall.ECONNRESET)
29+
if !shouldRetry {
30+
t.Fatal("retryer.Retry(400, syscall.ECONNRESET) = false, want true")
31+
}
32+
})
33+
t.Run("retry on syscall.ECONNREFUSED", func(t *testing.T) {
34+
_, shouldRetry := retryer.Retry(400, syscall.ECONNREFUSED)
35+
if !shouldRetry {
36+
t.Fatal("retryer.Retry(400, syscall.ECONNREFUSED) = false, want true")
37+
}
38+
})
39+
}

compute/metadata/retry_test.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Copyright 2021 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package metadata
16+
17+
import (
18+
"io"
19+
"testing"
20+
"time"
21+
)
22+
23+
// constantBackoff is a test stub whose pause never changes.
type constantBackoff struct{}

// Pause always returns a duration of 100 (nanoseconds).
func (constantBackoff) Pause() time.Duration {
	return time.Duration(100)
}
26+
27+
// errTemp is a test error that always reports itself as temporary.
type errTemp struct{}

// Error implements the error interface.
func (errTemp) Error() string {
	return "temporary error"
}

// Temporary marks the error as transient.
func (errTemp) Temporary() bool {
	return true
}
32+
33+
// errWrapped is a test error that wraps another error and exposes it
// through Unwrap.
type errWrapped struct {
	e error // the wrapped error; may be nil
}

// Error implements the error interface with a fixed message.
func (w errWrapped) Error() string {
	return "unwrap me to get more context"
}

// Unwrap returns the wrapped error.
func (w errWrapped) Unwrap() error {
	return w.e
}
40+
41+
func TestMetadataRetryer(t *testing.T) {
42+
tests := []struct {
43+
name string
44+
code int
45+
err error
46+
wantDelay time.Duration
47+
wantShouldRetry bool
48+
}{
49+
{
50+
name: "retry on 500",
51+
code: 500,
52+
wantDelay: 100,
53+
wantShouldRetry: true,
54+
},
55+
{
56+
name: "don't retry on 400",
57+
code: 400,
58+
err: io.EOF,
59+
wantDelay: 0,
60+
wantShouldRetry: false,
61+
},
62+
{
63+
name: "retry on io.ErrUnexpectedEOF",
64+
code: 400,
65+
err: io.ErrUnexpectedEOF,
66+
wantDelay: 100,
67+
wantShouldRetry: true,
68+
},
69+
{
70+
name: "retry on temporary error",
71+
code: 400,
72+
err: errTemp{},
73+
wantDelay: 100,
74+
wantShouldRetry: true,
75+
},
76+
{
77+
name: "retry on wrapped temporary error",
78+
code: 400,
79+
err: errWrapped{errTemp{}},
80+
wantDelay: 100,
81+
wantShouldRetry: true,
82+
},
83+
{
84+
name: "don't retry on wrapped io.EOF",
85+
code: 400,
86+
err: errWrapped{io.EOF},
87+
wantDelay: 0,
88+
wantShouldRetry: false,
89+
},
90+
}
91+
92+
for _, tc := range tests {
93+
t.Run(tc.name, func(t *testing.T) {
94+
retryer := metadataRetryer{bo: constantBackoff{}}
95+
delay, shouldRetry := retryer.Retry(tc.code, tc.err)
96+
if delay != tc.wantDelay {
97+
t.Fatalf("retryer.Retry(%v, %v) = %v, want %v", tc.code, tc.err, delay, tc.wantDelay)
98+
}
99+
if shouldRetry != tc.wantShouldRetry {
100+
t.Fatalf("retryer.Retry(%v, %v) = %v, want %v", tc.code, tc.err, shouldRetry, tc.wantShouldRetry)
101+
}
102+
})
103+
}
104+
}
105+
106+
func TestMetadataRetryerAttempts(t *testing.T) {
107+
retryer := metadataRetryer{bo: constantBackoff{}}
108+
for i := 1; i <= 6; i++ {
109+
_, shouldRetry := retryer.Retry(500, nil)
110+
if i == 6 {
111+
if shouldRetry {
112+
t.Fatal("an error should only be retried 5 times")
113+
}
114+
break
115+
}
116+
if !shouldRetry {
117+
t.Fatalf("retryer.Retry(500, nil) = false, want true")
118+
}
119+
}
120+
}

0 commit comments

Comments
 (0)