Skip to content

Commit

Permalink
Do not retry job when its status is unknown (#1335)
Browse files Browse the repository at this point in the history
When the gRPC stream is broken by a network glitch, ExecutionDone is called with a failed state. This triggers the retry call even though the real status of the task is unknown.

This change disables retry when the reason for ExecutionDone is ErrBrokenStream.
  • Loading branch information
Victor Castell committed May 27, 2023
1 parent e04a1f8 commit 51dc5f3
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
7 changes: 6 additions & 1 deletion dkron/grpc.go
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"net"
"strings"
"time"

metrics "github.com/armon/go-metrics"
Expand Down Expand Up @@ -205,8 +206,12 @@ func (grpcs *GRPCServer) ExecutionDone(ctx context.Context, execDoneReq *proto.E
}

// If the execution failed, retry it until retries limit (default: don't retry)
// Don't retry if the status is unknown
execution := NewExecutionFromProto(&pbex)
if !execution.Success && uint(execution.Attempt) < job.Retries+1 {
if !execution.Success &&
uint(execution.Attempt) < job.Retries+1 &&
!strings.HasPrefix(execution.Output, ErrBrokenStream.Error()) {
// Increment the attempt counter
execution.Attempt++

// Keep all execution properties intact except the last output
Expand Down
2 changes: 1 addition & 1 deletion dkron/grpc_client.go
Expand Up @@ -426,7 +426,7 @@ func (grpcc *GRPCClient) AgentRun(addr string, job *proto.Job, execution *proto.
if err != nil {
// At this point the execution status will be unknown, set the FinishedAt time and an explanatory message
execution.FinishedAt = ptypes.TimestampNow()
execution.Output = []byte(err.Error())
execution.Output = []byte(ErrBrokenStream.Error() + ": " + err.Error())

grpcc.logger.WithError(err).Error(ErrBrokenStream)

Expand Down

0 comments on commit 51dc5f3

Please sign in to comment.