Skip to content

Commit

Permalink
[House keeping] include container statuses for all container exit errors (#5161)
Browse files Browse the repository at this point in the history

* include container statuses for all container exit errors

Signed-off-by: Paul Dittamo <pvdittamo@gmail.com>

* add unit test

Signed-off-by: Paul Dittamo <pvdittamo@gmail.com>

---------

Signed-off-by: Paul Dittamo <pvdittamo@gmail.com>
  • Loading branch information
pvditt committed Apr 4, 2024
1 parent f1c2231 commit f8d4992
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
9 changes: 8 additions & 1 deletion flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -851,9 +851,11 @@ func DemystifyFailure(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCo
// }
// }
//

var isSystemError bool
// In some versions of GKE the reason can also be "Terminated"
if code == "Shutdown" || code == "Terminated" {
return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil
isSystemError = true
}

//
Expand Down Expand Up @@ -887,6 +889,11 @@ func DemystifyFailure(status v1.PodStatus, info pluginsCore.TaskInfo) (pluginsCo
}
}
}

if isSystemError {
return pluginsCore.PhaseInfoSystemRetryableFailure(Interrupted, message, &info), nil
}

return pluginsCore.PhaseInfoRetryableFailure(code, message, &info), nil
}

Expand Down
24 changes: 24 additions & 0 deletions flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1631,25 +1631,49 @@ func TestDemystifyFailure(t *testing.T) {
})

// A pod whose status reason is "Shutdown" is expected to come back as a retryable
// failure classified as a SYSTEM error with code "Interrupted", and the resulting
// error message is expected to still mention the terminated container's reason.
t.Run("GKE kubelet graceful node shutdown", func(t *testing.T) {
	wantReason := "some reason"

	// Container that was killed, as recorded in its last termination state.
	killed := v1.ContainerStatus{
		LastTerminationState: v1.ContainerState{
			Terminated: &v1.ContainerStateTerminated{
				Reason:   wantReason,
				ExitCode: SIGKILL,
			},
		},
	}
	podStatus := v1.PodStatus{
		Message:           "Pod Node is in progress of shutting down, not admitting any new pods",
		Reason:            "Shutdown",
		ContainerStatuses: []v1.ContainerStatus{killed},
	}

	phaseInfo, err := DemystifyFailure(podStatus, pluginsCore.TaskInfo{})

	assert.Nil(t, err)
	assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase())
	assert.Equal(t, "Interrupted", phaseInfo.Err().Code)
	assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind)
	assert.Contains(t, phaseInfo.Err().Message, wantReason)
})

// Variant of the graceful-node-shutdown case in which the pod-level reason is
// "Terminated" instead of "Shutdown". The expectations are the same: a retryable
// failure classified as a SYSTEM error with code "Interrupted", whose message
// still includes the terminated container's reason.
//
// The subtest has its own name so it does not collide with the "Shutdown" case;
// duplicate names are disambiguated by Go with a "#01" suffix, which makes the
// failing case hard to identify and to select with -run.
t.Run("GKE kubelet graceful node shutdown with Terminated reason", func(t *testing.T) {
	containerReason := "some reason"
	phaseInfo, err := DemystifyFailure(v1.PodStatus{
		Message: "Foobar",
		Reason:  "Terminated",
		ContainerStatuses: []v1.ContainerStatus{
			{
				LastTerminationState: v1.ContainerState{
					Terminated: &v1.ContainerStateTerminated{
						Reason:   containerReason,
						ExitCode: SIGKILL,
					},
				},
			},
		},
	}, pluginsCore.TaskInfo{})
	assert.Nil(t, err)
	assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase())
	assert.Equal(t, "Interrupted", phaseInfo.Err().Code)
	assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().Kind)
	// The container-level reason must be carried into the error message.
	assert.Contains(t, phaseInfo.Err().Message, containerReason)
})
}

Expand Down

0 comments on commit f8d4992

Please sign in to comment.