From 6590ed9ea9521c03baded1e0e3ed12cc4c196f27 Mon Sep 17 00:00:00 2001
From: gibsondan
Date: Thu, 5 Jan 2023 10:58:53 -0600
Subject: [PATCH] Fix maxResumeRunAttempts being explicitly enabled even if
 you set it to 0 (#11511)

Summary: This fix makes max_resume_run_attempts actually render as 0 when
maxResumeRunAttempts is explicitly set to 0, instead of the key being dropped
from the generated dagster.yaml and the default taking over.

I feel a little weird about setting this experimental feature on by default
when you turn on run monitoring in general, but changing that would be a
breaking change that can probably wait until 1.2 (where I think we should
turn run monitoring on by default but leave this feature off by default
until we have more confidence in it).

### Summary & Motivation

### How I Tested These Changes
---
 .../schema_tests/test_dagster_daemon.py       | 50 +++++++++++++++++++
 .../dagster/templates/configmap-instance.yaml |  2 +-
 helm/dagster/values.yaml                      | 13 ++---
 3 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/helm/dagster/schema/schema_tests/test_dagster_daemon.py b/helm/dagster/schema/schema_tests/test_dagster_daemon.py
index 777a03f61b77..22d584afd02e 100644
--- a/helm/dagster/schema/schema_tests/test_dagster_daemon.py
+++ b/helm/dagster/schema/schema_tests/test_dagster_daemon.py
@@ -207,6 +207,20 @@ def test_queued_run_coordinator_unique_values(
     ]
 
 
+def test_run_monitoring_defaults(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct()
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert "run_monitoring" not in instance
+
+
 def test_run_monitoring(
     instance_template: HelmTemplate,
 ): # pylint: disable=redefined-outer-name
@@ -222,6 +236,42 @@ def test_run_monitoring(
 
     assert instance["run_monitoring"]["enabled"] is True
 
+    assert "max_resume_run_attempts" not in instance["run_monitoring"]
+
+
+def test_run_monitoring_no_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 0})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 0
+
+
+def test_run_monitoring_set_max_resume_run_attempts(
+    instance_template: HelmTemplate,
+): # pylint: disable=redefined-outer-name
+    helm_values = DagsterHelmValues.construct(
+        dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 2})
+    )
+
+    configmaps = instance_template.render(helm_values)
+
+    assert len(configmaps) == 1
+
+    instance = yaml.full_load(configmaps[0].data["dagster.yaml"])
+
+    assert instance["run_monitoring"]["enabled"] is True
+    assert instance["run_monitoring"]["max_resume_run_attempts"] == 2
+
 
 def test_run_retries(
     instance_template: HelmTemplate,
diff --git a/helm/dagster/templates/configmap-instance.yaml b/helm/dagster/templates/configmap-instance.yaml
index 931f5a336417..4cc248fbeea1 100644
--- a/helm/dagster/templates/configmap-instance.yaml
+++ b/helm/dagster/templates/configmap-instance.yaml
@@ -90,7 +90,7 @@ data:
     run_monitoring:
       enabled: {{ $runMonitoring.enabled }}
      start_timeout_seconds: {{ $runMonitoring.startTimeoutSeconds }}
-      {{- if $runMonitoring.maxResumeRunAttempts }}
+      {{- if not (kindIs "invalid" $runMonitoring.maxResumeRunAttempts) }}
       max_resume_run_attempts: {{ $runMonitoring.maxResumeRunAttempts }}
      {{- end }}
       poll_interval_seconds: {{ $runMonitoring.pollIntervalSeconds }}
diff --git a/helm/dagster/values.yaml b/helm/dagster/values.yaml
index af07ec2d89f2..42a01ddc8b0d 100644
--- a/helm/dagster/values.yaml
+++ b/helm/dagster/values.yaml
@@ -1061,18 +1061,19 @@ dagsterDaemon:
   #   class: ~
   #   config: {}
 
-  # Experimental feature to add fault tolerance to Dagster runs. The new Monitoring Daemon will
-  # perform health checks on run workers. If a run doesn't start within the timeout, it will be
-  # marked as failed. If a run had started but then the run worker crashed, the daemon will attempt
-  # to resume the run with a new run worker.
+  # Enables monitoring of run worker pods. If a run doesn't start within the timeout, it will be
+  # marked as failed. If a run had started but then the run worker crashed, the monitoring daemon
+  # will either fail the run or attempt to resume the run with a new run worker.
   runMonitoring:
     enabled: false
     # Timeout for runs to start (avoids runs hanging in STARTED)
     startTimeoutSeconds: 180
     # How often to check on in progress runs
     pollIntervalSeconds: 120
-    # Max number of times to attempt to resume a run with a new run worker. Defaults to 3 if the the
-    # run launcher supports resuming runs, otherwise defaults to 0.
+    # [Experimental] Max number of times to attempt to resume a run with a new run worker instead
+    # of failing the run if the run worker has crashed. Only works for runs using the
+    # `k8s_job_executor` to run each op in its own Kubernetes job. If this value is set to nil,
+    # defaults to 3 if using the K8sRunLauncher, otherwise defaults to 0.
     maxResumeRunAttempts: ~
 
   runRetries:
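
A note for reviewers on why the one-line template change is needed: Go templates treat 0 as falsy in an `if`, so the old `{{- if $runMonitoring.maxResumeRunAttempts }}` guard dropped the key from the rendered dagster.yaml whenever it was explicitly set to 0, and the default (3 when the run launcher supports resuming runs) silently applied. The Sprig `kindIs "invalid"` check is only true for nil, so an explicit 0 now renders through. Below is a minimal sketch of the intended before/after behavior; the override values shown are a hypothetical user input, and the surrounding numbers come from the chart defaults in values.yaml above. Rendering can be spot-checked locally, for example with `helm template --show-only templates/configmap-instance.yaml`.

```yaml
# Hypothetical user overrides in values.yaml: only these two keys are set,
# everything else falls back to the chart defaults shown in this patch.
dagsterDaemon:
  runMonitoring:
    enabled: true
    maxResumeRunAttempts: 0

# run_monitoring block rendered into dagster.yaml after this patch.
# Before the patch, max_resume_run_attempts was omitted here (because `if 0`
# is falsy in Go templates), so the feature stayed enabled via the default.
run_monitoring:
  enabled: true
  start_timeout_seconds: 180
  max_resume_run_attempts: 0
  poll_interval_seconds: 120
```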