From 6590ed9ea9521c03baded1e0e3ed12cc4c196f27 Mon Sep 17 00:00:00 2001
From: gibsondan
Date: Thu, 5 Jan 2023 10:58:53 -0600
Subject: [PATCH] Fix maxResumeRunAttempts being explicitly enabled even if
 you set it to 0 (#11511)

Summary: This fix makes max_resume_run_attempts actually render as 0 when
maxResumeRunAttempts is explicitly set to 0, instead of the key being dropped
from the generated dagster.yaml and the default taking over.

I feel a little weird about setting this experimental feature on by default
when you turn on run monitoring in general, but changing that would be a
breaking change that can probably wait until 1.2 (where I think we should
turn run monitoring on by default but leave this feature off by default
until we have more confidence in it).

### Summary & Motivation

### How I Tested These Changes
---
 .../schema_tests/test_dagster_daemon.py       | 50 +++++++++++++++++++
 .../dagster/templates/configmap-instance.yaml |  2 +-
 helm/dagster/values.yaml                      | 13 ++---
 3 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/helm/dagster/schema/schema_tests/test_dagster_daemon.py b/helm/dagster/schema/schema_tests/test_dagster_daemon.py
index 777a03f61b77..22d584afd02e 100644
--- a/helm/dagster/schema/schema_tests/test_dagster_daemon.py
+++ b/helm/dagster/schema/schema_tests/test_dagster_daemon.py
@@ -207,6 +207,20 @@ def test_queued_run_coordinator_unique_values(
     ]
 
 
+def test_run_monitoring_defaults(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct()
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert "run_monitoring" not in instance
+
+
 def test_run_monitoring(
     instance_template: HelmTemplate,
 ): # pylint: disable=redefined-outer-name
@@ -222,6 +236,42 @@ def test_run_monitoring(
 
     assert instance["run_monitoring"]["enabled"] is True
 
+    assert "max_resume_run_attempts" not in instance["run_monitoring"]
+
+
+def test_run_monitoring_no_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 0})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 0
+
+
+def test_run_monitoring_set_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 2})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 2
+
 
 def test_run_retries(
     instance_template: HelmTemplate,
diff --git a/helm/dagster/templates/configmap-instance.yaml b/helm/dagster/templates/configmap-instance.yaml
index 931f5a336417..4cc248fbeea1 100644
--- a/helm/dagster/templates/configmap-instance.yaml
+++ b/helm/dagster/templates/configmap-instance.yaml
@@ -90,7 +90,7 @@ data:
     run_monitoring:
       enabled: {{ $runMonitoring.enabled }}
      start_timeout_seconds: {{ $runMonitoring.startTimeoutSeconds }}
-      {{- if $runMonitoring.maxResumeRunAttempts }}
+      {{- if not (kindIs "invalid" $runMonitoring.maxResumeRunAttempts) }}
       max_resume_run_attempts: {{ $runMonitoring.maxResumeRunAttempts }}
      {{- end }}
       poll_interval_seconds: {{ $runMonitoring.pollIntervalSeconds }}
diff --git a/helm/dagster/values.yaml b/helm/dagster/values.yaml
index af07ec2d89f2..42a01ddc8b0d 100644
--- a/helm/dagster/values.yaml
+++ b/helm/dagster/values.yaml
@@ -1061,18 +1061,19 @@ dagsterDaemon:
   #   class: ~
   #   config: {}
 
-  # Experimental feature to add fault tolerance to Dagster runs. The new Monitoring Daemon will
-  # perform health checks on run workers. If a run doesn't start within the timeout, it will be
-  # marked as failed. If a run had started but then the run worker crashed, the daemon will attempt
-  # to resume the run with a new run worker.
+  # Enables monitoring of run worker pods. If a run doesn't start within the timeout, it will be
+  # marked as failed. If a run had started but then the run worker crashed, the monitoring daemon
+  # will either fail the run or attempt to resume the run with a new run worker.
   runMonitoring:
     enabled: false
     # Timeout for runs to start (avoids runs hanging in STARTED)
     startTimeoutSeconds: 180
     # How often to check on in progress runs
     pollIntervalSeconds: 120
-    # Max number of times to attempt to resume a run with a new run worker. Defaults to 3 if the the
-    # run launcher supports resuming runs, otherwise defaults to 0.
+    # [Experimental] Max number of times to attempt to resume a run with a new run worker instead
+    # of failing the run if the run worker has crashed. Only works for runs using the
+    # `k8s_job_executor` to run each op in its own Kubernetes job. If this value is set to nil,
+    # defaults to 3 if using the K8sRunLauncher, otherwise defaults to 0.
     maxResumeRunAttempts: ~
 
   runRetries:
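
A note for reviewers on why the one-line template change is needed: Go templates treat 0 as falsy in an `if`, so the old `{{- if $runMonitoring.maxResumeRunAttempts }}` guard dropped the key from the rendered dagster.yaml whenever it was explicitly set to 0, and the default (3 when the run launcher supports resuming runs) silently applied. The Sprig `kindIs "invalid"` check is only true for nil, so an explicit 0 now renders through. Below is a minimal sketch of the intended before/after behavior; the override values shown are a hypothetical user input, and the surrounding numbers come from the chart defaults in values.yaml above. Rendering can be spot-checked locally, for example with `helm template --show-only templates/configmap-instance.yaml`.

```yaml
# Hypothetical user overrides in values.yaml: only these two keys are set,
# everything else falls back to the chart defaults shown in this patch.
dagsterDaemon:
  runMonitoring:
    enabled: true
    maxResumeRunAttempts: 0

# run_monitoring block rendered into dagster.yaml after this patch.
# Before the patch, max_resume_run_attempts was omitted here (because `if 0`
# is falsy in Go templates), so the feature stayed enabled via the default.
run_monitoring:
  enabled: true
  start_timeout_seconds: 180
  max_resume_run_attempts: 0
  poll_interval_seconds: 120
```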