Skip to content

Commit

Permalink
Fix maxResumeRunAttempts being explicitly enabled even if you set it …
Browse files Browse the repository at this point in the history
…to 0 (#11511)

Summary:
This fix makes maxResumeRunAttempts equal to 0 when it is explicitly set
to 0.

I feel a little weird about setting this experimental feature on by
default when you turn on run monitoring in general, but changign that
would be a breaking change that can probably wait until 1.2 (where I
think we should turn run monitoring on by default but this feature off
by default until we have more confidence in it)

### Summary & Motivation

### How I Tested These Changes
  • Loading branch information
gibsondan committed Jan 5, 2023
1 parent e9822ed commit 6590ed9
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 7 deletions.
50 changes: 50 additions & 0 deletions helm/dagster/schema/schema_tests/test_dagster_daemon.py
Expand Up @@ -207,6 +207,20 @@ def test_queued_run_coordinator_unique_values(
]


def test_run_monitoring_defaults(
instance_template: HelmTemplate,
): # pylint: disable=redefined-outer-name
helm_values = DagsterHelmValues.construct()

configmaps = instance_template.render(helm_values)

assert len(configmaps) == 1

instance = yaml.full_load(configmaps[0].data["dagster.yaml"])

assert "run_monitoring" not in instance


def test_run_monitoring(
instance_template: HelmTemplate,
): # pylint: disable=redefined-outer-name
Expand All @@ -222,6 +236,42 @@ def test_run_monitoring(

assert instance["run_monitoring"]["enabled"] is True

assert not "max_resume_run_attempts" in instance["run_monitoring"]


def test_run_monitoring_no_max_resume_run_attempts(
instance_template: HelmTemplate,
): # pylint: disable=redefined-outer-name
helm_values = DagsterHelmValues.construct(
dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 0})
)

configmaps = instance_template.render(helm_values)

assert len(configmaps) == 1

instance = yaml.full_load(configmaps[0].data["dagster.yaml"])

assert instance["run_monitoring"]["enabled"] is True
assert instance["run_monitoring"]["max_resume_run_attempts"] == 0


def test_run_monitoring_set_max_resume_run_attempts(
instance_template: HelmTemplate,
): # pylint: disable=redefined-outer-name
helm_values = DagsterHelmValues.construct(
dagsterDaemon=Daemon.construct(runMonitoring={"enabled": True, "maxResumeRunAttempts": 2})
)

configmaps = instance_template.render(helm_values)

assert len(configmaps) == 1

instance = yaml.full_load(configmaps[0].data["dagster.yaml"])

assert instance["run_monitoring"]["enabled"] is True
assert instance["run_monitoring"]["max_resume_run_attempts"] == 2


def test_run_retries(
instance_template: HelmTemplate,
Expand Down
2 changes: 1 addition & 1 deletion helm/dagster/templates/configmap-instance.yaml
Expand Up @@ -90,7 +90,7 @@ data:
run_monitoring:
enabled: {{ $runMonitoring.enabled }}
start_timeout_seconds: {{ $runMonitoring.startTimeoutSeconds }}
{{- if $runMonitoring.maxResumeRunAttempts }}
{{- if not (kindIs "invalid" $runMonitoring.maxResumeRunAttempts) }}
max_resume_run_attempts: {{ $runMonitoring.maxResumeRunAttempts }}
{{- end }}
poll_interval_seconds: {{ $runMonitoring.pollIntervalSeconds }}
Expand Down
13 changes: 7 additions & 6 deletions helm/dagster/values.yaml
Expand Up @@ -1061,18 +1061,19 @@ dagsterDaemon:
# class: ~
# config: {}

# Experimental feature to add fault tolerance to Dagster runs. The new Monitoring Daemon will
# perform health checks on run workers. If a run doesn't start within the timeout, it will be
# marked as failed. If a run had started but then the run worker crashed, the daemon will attempt
# to resume the run with a new run worker.
# Enables monitoring of run worker pods. If a run doesn't start within the timeout, it will be
# marked as failed. If a run had started but then the run worker crashed, the monitoring daemon
# will either fail the run or attempt to resume the run with a new run worker.
runMonitoring:
enabled: false
# Timeout for runs to start (avoids runs hanging in STARTED)
startTimeoutSeconds: 180
# How often to check on in progress runs
pollIntervalSeconds: 120
# Max number of times to attempt to resume a run with a new run worker. Defaults to 3 if the the
# run launcher supports resuming runs, otherwise defaults to 0.
# [Experimental] Max number of times to attempt to resume a run with a new run worker instead
# of failing the run if the run worker has crashed. Only works for runs using the
# `k8s_job_executor` to run each op in its own Kubernetes job. If this value is set to nil,
# defaults to 3 if using the K8sRunLauncher, otherwise defaults to 0.
maxResumeRunAttempts: ~

runRetries:
Expand Down

0 comments on commit 6590ed9

Please sign in to comment.