diff --git a/cmd/argoexec/commands/emissary.go b/cmd/argoexec/commands/emissary.go index 1973c7c95e7f..ba1a5179524b 100644 --- a/cmd/argoexec/commands/emissary.go +++ b/cmd/argoexec/commands/emissary.go @@ -71,24 +71,42 @@ func NewEmissaryCommand() *cobra.Command { return fmt.Errorf("failed to unmarshal template: %w", err) } + // setup signal handlers + signals := make(chan os.Signal, 1) + defer close(signals) + signal.Notify(signals) + defer signal.Reset() + for _, x := range template.ContainerSet.GetGraph() { if x.Name == containerName { for _, y := range x.Dependencies { logger.Infof("waiting for dependency %q", y) + WaitForDependency: for { - data, err := ioutil.ReadFile(filepath.Clean(varRunArgo + "/ctr/" + y + "/exitcode")) - if os.IsNotExist(err) { - time.Sleep(time.Second) - continue - } - exitCode, err := strconv.Atoi(string(data)) - if err != nil { - return fmt.Errorf("failed to read exit-code of dependency %q: %w", y, err) - } - if exitCode != 0 { - return fmt.Errorf("dependency %q exited with non-zero code: %d", y, exitCode) + select { + // If we receive a terminated or killed signal, we should exit immediately. + case s := <-signals: + switch s { + case osspecific.Term: + // exit with 128 + 15 (SIGTERM) + return errors.NewExitErr(143) + case os.Kill: + // exit with 128 + 9 (SIGKILL) + return errors.NewExitErr(137) + } + default: + data, _ := os.ReadFile(filepath.Clean(varRunArgo + "/ctr/" + y + "/exitcode")) + exitCode, err := strconv.Atoi(string(data)) + if err != nil { + time.Sleep(time.Second) + continue + } + if exitCode != 0 { + return fmt.Errorf("dependency %q exited with non-zero code: %d", y, exitCode) + } + + break WaitForDependency } - break } } } @@ -118,11 +136,6 @@ func NewEmissaryCommand() *cobra.Command { } cmdErr := retry.OnError(backoff, func(error) bool { return true }, func() error { - // setup signal handlers - signals := make(chan os.Signal, 1) - defer close(signals) - signal.Notify(signals) - defer signal.Reset() command, closer, err := startCommand(name, args, template) if err != nil { diff --git a/test/e2e/signals_test.go b/test/e2e/signals_test.go index 2e116f12362a..db702f3d5f09 100644 --- a/test/e2e/signals_test.go +++ b/test/e2e/signals_test.go @@ -149,6 +149,29 @@ func (s *SignalsSuite) TestSignaled() { }) } +func (s *SignalsSuite) TestSignaledContainerSet() { + s.Given(). + Workflow("@testdata/signaled-container-set-workflow.yaml"). + When(). + SubmitWorkflow(). + WaitForWorkflow(). + Then(). + ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) { + assert.Equal(t, wfv1.WorkflowFailed, status.Phase) + assert.Equal(t, "OOMKilled (exit code 137)", status.Message) + one := status.Nodes.FindByDisplayName("one") + if assert.NotNil(t, one) { + assert.Equal(t, wfv1.NodeFailed, one.Phase) + assert.Equal(t, "OOMKilled (exit code 137): ", one.Message) + } + two := status.Nodes.FindByDisplayName("two") + if assert.NotNil(t, two) { + assert.Equal(t, wfv1.NodeFailed, two.Phase) + assert.Equal(t, "Error (exit code 143): ", two.Message) + } + }) +} + func TestSignalsSuite(t *testing.T) { suite.Run(t, new(SignalsSuite)) } diff --git a/test/e2e/testdata/signaled-container-set-workflow.yaml b/test/e2e/testdata/signaled-container-set-workflow.yaml new file mode 100644 index 000000000000..f1d8fcb6c2a6 --- /dev/null +++ b/test/e2e/testdata/signaled-container-set-workflow.yaml @@ -0,0 +1,50 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + name: example + namespace: argo +spec: + templates: + - name: entrypoint + metadata: + containerSet: + containers: + - name: one + resources: + requests: + memory: "50Mi" + cpu: "50m" + limits: + memory: "50Mi" + image: argoproj/argosay:v2 + command: + - bash + - '-c' + args: + - | + /bin/bash <<'EOF' + echo "hello one" + apt update -y + apt install stress -y + echo 'stress --vm 1 --vm-bytes 512M --vm-hang 100' > abc.sh + bash abc.sh + EOF + - name: two + resources: + requests: + memory: "150Mi" + cpu: "50m" + limits: + memory: "250Mi" + image: argoproj/argosay:v2 + command: + - bash + - '-c' + args: + - | + /bin/bash <<'EOF' + echo "hello world" + EOF + dependencies: + - one + entrypoint: entrypoint \ No newline at end of file diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index 6288879679f1..61cc84e2545c 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -1421,8 +1421,16 @@ func getExitCode(pod *apiv1.Pod) *int32 { } func podHasContainerNeedingTermination(pod *apiv1.Pod, tmpl wfv1.Template) bool { + // pod needs to be terminated if any of the following are true: + // 1. any main container has exited with non-zero exit code + // 2. all main containers have exited + // pod termination will cause the wait container to finish + for _, c := range pod.Status.ContainerStatuses { + if tmpl.IsMainContainerName(c.Name) && c.State.Terminated != nil && c.State.Terminated.ExitCode != 0 { + return true + } + } for _, c := range pod.Status.ContainerStatuses { - // Only clean up pod when all main containers are terminated if tmpl.IsMainContainerName(c.Name) && c.State.Terminated == nil { return false } diff --git a/workflow/executor/os-specific/signal_darwin.go b/workflow/executor/os-specific/signal_darwin.go index 7c5a7e7b6343..0e55a002cea6 100644 --- a/workflow/executor/os-specific/signal_darwin.go +++ b/workflow/executor/os-specific/signal_darwin.go @@ -8,6 +8,10 @@ import ( "github.com/argoproj/argo-workflows/v3/util/errors" ) +var ( + Term = syscall.SIGTERM +) + func CanIgnoreSignal(s os.Signal) bool { return s == syscall.SIGCHLD || s == syscall.SIGURG } diff --git a/workflow/executor/os-specific/signal_windows.go b/workflow/executor/os-specific/signal_windows.go index 4c3bed84e7e9..9eec8061c8d1 100644 --- a/workflow/executor/os-specific/signal_windows.go +++ b/workflow/executor/os-specific/signal_windows.go @@ -7,6 +7,10 @@ import ( "github.com/argoproj/argo-workflows/v3/util/errors" ) +var ( + Term = os.Interrupt +) + func CanIgnoreSignal(s os.Signal) bool { return false }