Skip to content

Commit f9608d4

Browse files
authored
feat(bigquery): augment retry predicate (#5387)
* feat(bigquery): augment retry predicate https://google.aip.dev/194 guidance is that INTERNAL errors are considered non-retryable. This PR deviates from that slightly, allowing internalError to be retried for job insertion and polling cases, as the BigQuery backend has an expectation that such work will be retried. All other retries continue to use the same retry predicate.
1 parent adea7fc commit f9608d4

File tree

3 files changed

+23
-8
lines changed

3 files changed

+23
-8
lines changed

Diff for: bigquery/bigquery.go

+21-6
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ func (c *Client) insertJob(ctx context.Context, job *bq.Job, media io.Reader) (*
127127
// have to read the contents and keep it in memory, and that could be expensive.
128128
// TODO(jba): Look into retrying if media != nil.
129129
if job.JobReference != nil && media == nil {
130-
err = runWithRetry(ctx, invoke)
130+
// We deviate from default retries due to BigQuery wanting to retry structured internal job errors.
131+
err = runWithRetryExplicit(ctx, invoke, jobRetryReasons)
131132
} else {
132133
err = invoke()
133134
}
@@ -152,7 +153,7 @@ func (c *Client) runQuery(ctx context.Context, queryRequest *bq.QueryRequest) (*
152153
}
153154

154155
// We control request ID, so we can always runWithRetry.
155-
err = runWithRetry(ctx, invoke)
156+
err = runWithRetryExplicit(ctx, invoke, jobRetryReasons)
156157
if err != nil {
157158
return nil, err
158159
}
@@ -174,6 +175,10 @@ func unixMillisToTime(m int64) time.Time {
174175
// See the similar function in ../storage/invoke.go. The main difference is the
175176
// reason for retrying.
176177
func runWithRetry(ctx context.Context, call func() error) error {
178+
return runWithRetryExplicit(ctx, call, defaultRetryReasons)
179+
}
180+
181+
func runWithRetryExplicit(ctx context.Context, call func() error, allowedReasons []string) error {
177182
// These parameters match the suggestions in https://cloud.google.com/bigquery/sla.
178183
backoff := gax.Backoff{
179184
Initial: 1 * time.Second,
@@ -185,15 +190,20 @@ func runWithRetry(ctx context.Context, call func() error) error {
185190
if err == nil {
186191
return true, nil
187192
}
188-
return !retryableError(err), err
193+
return !retryableError(err, allowedReasons), err
189194
})
190195
}
191196

197+
var (
198+
defaultRetryReasons = []string{"backendError", "rateLimitExceeded"}
199+
jobRetryReasons = []string{"backendError", "rateLimitExceeded", "internalError"}
200+
)
201+
192202
// This is the correct definition of retryable according to the BigQuery team. It
193203
// also considers 502 ("Bad Gateway") and 503 ("Service Unavailable") errors
194204
// retryable; these are returned by systems between the client and the BigQuery
195205
// service.
196-
func retryableError(err error) bool {
206+
func retryableError(err error, allowedReasons []string) bool {
197207
if err == nil {
198208
return false
199209
}
@@ -215,8 +225,13 @@ func retryableError(err error) bool {
215225
var reason string
216226
if len(e.Errors) > 0 {
217227
reason = e.Errors[0].Reason
228+
for _, r := range allowedReasons {
229+
if reason == r {
230+
return true
231+
}
232+
}
218233
}
219-
if e.Code == http.StatusServiceUnavailable || e.Code == http.StatusBadGateway || reason == "backendError" || reason == "rateLimitExceeded" {
234+
if e.Code == http.StatusServiceUnavailable || e.Code == http.StatusBadGateway {
220235
return true
221236
}
222237
case *url.Error:
@@ -233,7 +248,7 @@ func retryableError(err error) bool {
233248
}
234249
// Unwrap is only supported in go1.13.x+
235250
if e, ok := err.(interface{ Unwrap() error }); ok {
236-
return retryableError(e.Unwrap())
251+
return retryableError(e.Unwrap(), allowedReasons)
237252
}
238253
return false
239254
}

Diff for: bigquery/bigquery_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ func TestRetryableErrors(t *testing.T) {
119119
true,
120120
},
121121
} {
122-
got := retryableError(tc.in)
122+
got := retryableError(tc.in, defaultRetryReasons)
123123
if got != tc.want {
124124
t.Errorf("case (%s) mismatch: got %t want %t", tc.description, got, tc.want)
125125
}

Diff for: bigquery/job.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ func (j *Job) waitForQuery(ctx context.Context, projectID string) (Schema, uint6
345345
err := internal.Retry(ctx, backoff, func() (stop bool, err error) {
346346
res, err = call.Do()
347347
if err != nil {
348-
return !retryableError(err), err
348+
return !retryableError(err, jobRetryReasons), err
349349
}
350350
if !res.JobComplete { // GetQueryResults may return early without error; retry.
351351
return false, nil

0 commit comments

Comments
 (0)