Fix maxResumeRunAttempts being explicitly enabled even if you set it …

…to 0 (#11511) Summary: This fix makes maxResumeRunAttempts equal to 0 when it is explicitly set to 0. I feel a little weird about setting this experimental feature on by default when you turn on run monitoring in general, but changign that would be a breaking change that can probably wait until 1.2 (where I think we should turn run monitoring on by default but this feature off by default until we have more confidence in it) ### Summary & Motivation ### How I Tested These Changes
dagster-io · Jan 5, 2023 · 6590ed9 · 6590ed9
1 parent e9822ed
commit 6590ed9
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 7 deletions.
diff --git a/helm/dagster/schema/schema_tests/test_dagster_daemon.py b/helm/dagster/schema/schema_tests/test_dagster_daemon.py
@@ -207,6 +207,20 @@ def test_queued_run_coordinator_unique_values(
     ]
 
 
+def test_run_monitoring_defaults(
+    instance_template: HelmTemplate,
+):  # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct()
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert "run_monitoring" not in instance
+
+
 def test_run_monitoring(
     instance_template: HelmTemplate,
 ):  # pylint: disable=redefined-outer-name
@@ -222,6 +236,42 @@ def test_run_monitoring(
 
     assert instance["run_monitoring"]["enabled"] is True
 
+    assert not "max_resume_run_attempts" in instance["run_monitoring"]
+
+
+def test_run_monitoring_no_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+):  # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 0})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 0
+
+
+def test_run_monitoring_set_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+):  # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 2})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 2
+
 
 def test_run_retries(
     instance_template: HelmTemplate,

diff --git a/helm/dagster/templates/configmap-instance.yaml b/helm/dagster/templates/configmap-instance.yaml
@@ -90,7 +90,7 @@ data:
     run_monitoring:
       enabled: {{ $runMonitoring.enabled }}
       start_timeout_seconds:  {{ $runMonitoring.startTimeoutSeconds }}
-      {{- if $runMonitoring.maxResumeRunAttempts }}
+      {{- if not (kindIs "invalid" $runMonitoring.maxResumeRunAttempts) }}
       max_resume_run_attempts: {{ $runMonitoring.maxResumeRunAttempts }}
       {{- end }}
       poll_interval_seconds: {{ $runMonitoring.pollIntervalSeconds }}

diff --git a/helm/dagster/values.yaml b/helm/dagster/values.yaml
@@ -1061,18 +1061,19 @@ dagsterDaemon:
     #     class: ~
     #     config: {}
 
-  # Experimental feature to add fault tolerance to Dagster runs. The new Monitoring Daemon will
-  # perform health checks on run workers. If a run doesn't start within the timeout, it will be
-  # marked as failed. If a run had started but then the run worker crashed, the daemon will attempt
-  # to resume the run with a new run worker.
+  # Enables monitoring of run worker pods. If a run doesn't start within the timeout, it will be
+  # marked as failed. If a run had started but then the run worker crashed, the monitoring daemon
+  # will either fail the run or attempt to resume the run with a new run worker.
   runMonitoring:
     enabled: false
     # Timeout for runs to start (avoids runs hanging in STARTED)
     startTimeoutSeconds: 180
     # How often to check on in progress runs
     pollIntervalSeconds: 120
-    # Max number of times to attempt to resume a run with a new run worker. Defaults to 3 if the the
-    # run launcher supports resuming runs, otherwise defaults to 0.
+    # [Experimental] Max number of times to attempt to resume a run with a new run worker instead
+    # of failing the run if the run worker has crashed. Only works for runs using the
+    # `k8s_job_executor` to run each op in its own Kubernetes job. If this value is set to nil,
+    # defaults to 3 if using the K8sRunLauncher, otherwise defaults to 0.
     maxResumeRunAttempts: ~
 
   runRetries: