From 4c9cc8c18e8995632e399792ce4f7df397328e03 Mon Sep 17 00:00:00 2001 From: nthmost-orkes Date: Tue, 28 Apr 2026 11:42:59 -0700 Subject: [PATCH 1/3] docs: explain execute() RUNNING-after-timeout behavior and how to debug stuck workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses conductor-oss/getting-started#53. Documents that status='RUNNING' from execute() is expected when wait_for_seconds (default 10s) expires before the workflow completes — a common surprise when a worker exception triggers a 60s retry delay. - Expand execute() docstring with the wait_for_seconds vs retryDelaySeconds gotcha, and a code snippet for inspecting failed task details programmatically - Add a FAQ entry in README.md covering the same scenario with a fix and debug steps - Add a "Debugging a stuck workflow" section in docs/WORKFLOW.md, and a callout box under "Execute workflow synchronously" explaining the timing relationship - Document that WorkflowRun.reason_for_incompletion is deprecated; recommend get_workflow(id, include_tasks=True) + task.reason_for_incompletion instead --- README.md | 31 ++++++++++++ docs/WORKFLOW.md | 49 +++++++++++++++++++ .../workflow/executor/workflow_executor.py | 30 +++++++++++- 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 958dfbe35..0dc11c635 100644 --- a/README.md +++ b/README.md @@ -490,6 +490,37 @@ Yes. Conductor ensures workflows complete reliably even in the face of infrastru No. While Conductor excels at asynchronous orchestration, it also supports synchronous workflow execution when immediate results are required. +**Why did `execute()` return `status: RUNNING` with no output?** + +`execute()` blocks until the workflow finishes **or** `wait_for_seconds` elapses (default: 10 s), +whichever comes first. If it times out, you get `status='RUNNING'` — that is correct behavior, +not a bug. + +The most common cause: your worker raised an exception. 
Conductor marks the task FAILED and +schedules a retry after `retryDelaySeconds` (default: **60 s**). The default 10 s wait expires +while the retry is pending, so `execute()` returns before the workflow completes. + +**To fix**: increase `wait_for_seconds` to outlast the retry cycle: + +```python +# default retryDelaySeconds is 60 — wait long enough to cover one retry +run = executor.execute(name='my_workflow', version=1, workflow_input={...}, wait_for_seconds=70) +``` + +**To debug** when a workflow is stuck: + +```python +# Inspect task statuses and failure reasons +wf = executor.get_workflow(run.workflow_id, include_tasks=True) +for task in wf.tasks: + if task.status in ('FAILED', 'FAILED_WITH_TERMINAL_ERROR'): + print(task.reference_task_name, task.reason_for_incompletion) +``` + +You can also open the Conductor UI at `<conductor-ui-url>/execution/<workflow_id>` — it shows each task's +status, retry count, and the worker exception message directly. Worker tracebacks are also logged +at ERROR level by the SDK in the `TaskHandler` process. + **Do I need to use a Conductor-specific framework?** No. Conductor is language and framework agnostic. Use your preferred language and framework — the [SDKs](https://github.com/conductor-oss/conductor#conductor-sdks) provide native integration for Python, Java, JavaScript, Go, C#, and more. diff --git a/docs/WORKFLOW.md b/docs/WORKFLOW.md index e7c2cde8e..2802ffc13 100644 --- a/docs/WORKFLOW.md +++ b/docs/WORKFLOW.md @@ -61,6 +61,55 @@ workflow_id = workflow_client.execute_workflow( ) ``` +> **`wait_for_seconds` and task retries** +> +> `execute()` / `execute_workflow()` block for at most `wait_for_seconds` (default: **10 s**). +> If the workflow is still running when the timer fires, the call returns with +> `status='RUNNING'` and empty output — this is expected behavior, not an error. +> +> The most common trigger: a worker exception. Conductor marks the task FAILED and waits +> `retryDelaySeconds` (default: **60 s**) before retrying. 
The default 10 s timeout expires +> during that wait, so you see `RUNNING`. Set `wait_for_seconds` to a value larger than +> `retryDelaySeconds` to ensure the call waits through at least one retry cycle: +> +> ```python +> run = executor.execute( +> name='my_workflow', version=1, workflow_input={...}, +> wait_for_seconds=70 # covers one retry at the default 60 s delay +> ) +> ``` + +#### Debugging a stuck workflow + +When a workflow returns `RUNNING` or never completes, use these steps to find out why. + +**1. Check the Conductor UI** + +Open `<conductor-ui-url>/execution/<workflow_id>`. The timeline view shows each task's status, retry +count, and the worker exception message — usually the fastest way to diagnose a failure. + +**2. Inspect task statuses programmatically** + +`get_workflow` with `include_tasks=True` returns the full task list. Check failed tasks for +their `reason_for_incompletion`: + +```python +wf = executor.get_workflow(workflow_id, include_tasks=True) +for task in wf.tasks: + print(task.reference_task_name, task.status, task.reason_for_incompletion) +``` + +**3. Read the worker logs** + +When a worker function raises an exception, the SDK catches it, logs the traceback at ERROR +level, and reports the task as FAILED. Worker logs come from the `TaskHandler` process — check +the terminal output or your process manager's log stream. + +**Note on `reason_for_incompletion` on `WorkflowRun`** + +`WorkflowRun.reason_for_incompletion` is deprecated. Use `get_workflow(id, include_tasks=True)` +and read `task.reason_for_incompletion` on the specific failed task instead (see step 2 above). 
+ ### Fetch a workflow execution #### Exclude tasks diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index 20074de09..116d036dd 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -91,8 +91,34 @@ def execute_workflow_with_return_strategy(self, request: StartWorkflowRequest, w def execute(self, name: str, version: Optional[int] = None, workflow_input: Any = None, wait_until_task_ref: Optional[str] = None, wait_for_seconds: int = 10, request_id: Optional[str] = None, correlation_id: Optional[str] = None, domain: Optional[str] = None) -> WorkflowRun: - """Executes a workflow with StartWorkflowRequest and waits for the completion of the workflow or until a - specific task in the workflow """ + """Execute a workflow synchronously and return the result. + + Blocks until the workflow reaches a terminal state (COMPLETED, FAILED, TIMED_OUT, + TERMINATED) or until ``wait_for_seconds`` elapses, whichever comes first. If the + timeout fires first, returns a ``WorkflowRun`` with ``status='RUNNING'`` and empty + output — this is normal behavior, not an error. + + **Common gotcha — RUNNING with no output after a worker exception:** + The default ``wait_for_seconds=10`` is shorter than the default task + ``retryDelaySeconds=60``. If your worker raises an exception, Conductor marks the task + FAILED and schedules a retry after 60 s. The 10 s wait expires while the retry is + pending, so you see ``status='RUNNING'``. Increase ``wait_for_seconds`` to outlast the + full retry cycle (e.g. ``wait_for_seconds=70`` for one retry with the default delay). + + **Debugging a RUNNING result:** + + - Open the Conductor UI at ``/execution/`` — the UI shows task + failures and worker exception messages directly. 
+ - Fetch task details programmatically:: + + wf = executor.get_workflow(run.workflow_id, include_tasks=True) + for task in wf.tasks: + if task.status in ('FAILED', 'FAILED_WITH_TERMINAL_ERROR'): + print(task.reference_task_name, task.reason_for_incompletion) + + - Check ``TaskHandler`` / worker process logs for the Python traceback. Worker + exceptions are logged at ERROR level by the SDK before the task result is reported. + """ workflow_input = workflow_input or {} if request_id is None: request_id = str(uuid.uuid4()) From bf045f017d5b5dd4558bd6636e1d1299c797d291 Mon Sep 17 00:00:00 2001 From: nthmost-orkes Date: Tue, 28 Apr 2026 12:19:47 -0700 Subject: [PATCH 2/3] docs: tighten execute() docstring --- .../workflow/executor/workflow_executor.py | 35 ++++++------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index 116d036dd..b65fe8466 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -91,33 +91,20 @@ def execute_workflow_with_return_strategy(self, request: StartWorkflowRequest, w def execute(self, name: str, version: Optional[int] = None, workflow_input: Any = None, wait_until_task_ref: Optional[str] = None, wait_for_seconds: int = 10, request_id: Optional[str] = None, correlation_id: Optional[str] = None, domain: Optional[str] = None) -> WorkflowRun: - """Execute a workflow synchronously and return the result. + """Execute a workflow synchronously and wait for it to complete. - Blocks until the workflow reaches a terminal state (COMPLETED, FAILED, TIMED_OUT, - TERMINATED) or until ``wait_for_seconds`` elapses, whichever comes first. If the - timeout fires first, returns a ``WorkflowRun`` with ``status='RUNNING'`` and empty - output — this is normal behavior, not an error. 
+ Returns when the workflow reaches a terminal state or ``wait_for_seconds`` elapses. + If the timeout fires first, returns ``status='RUNNING'`` with empty output — not an error. - **Common gotcha — RUNNING with no output after a worker exception:** - The default ``wait_for_seconds=10`` is shorter than the default task - ``retryDelaySeconds=60``. If your worker raises an exception, Conductor marks the task - FAILED and schedules a retry after 60 s. The 10 s wait expires while the retry is - pending, so you see ``status='RUNNING'``. Increase ``wait_for_seconds`` to outlast the - full retry cycle (e.g. ``wait_for_seconds=70`` for one retry with the default delay). + **Getting RUNNING back?** The default ``wait_for_seconds=10`` is shorter than the + default task ``retryDelaySeconds=60``. A failing worker triggers a 60 s retry wait, + so the 10 s timeout fires while the retry is pending. Raise ``wait_for_seconds`` + (e.g. 70) or inspect failed tasks:: - **Debugging a RUNNING result:** - - - Open the Conductor UI at ``/execution/`` — the UI shows task - failures and worker exception messages directly. - - Fetch task details programmatically:: - - wf = executor.get_workflow(run.workflow_id, include_tasks=True) - for task in wf.tasks: - if task.status in ('FAILED', 'FAILED_WITH_TERMINAL_ERROR'): - print(task.reference_task_name, task.reason_for_incompletion) - - - Check ``TaskHandler`` / worker process logs for the Python traceback. Worker - exceptions are logged at ERROR level by the SDK before the task result is reported. 
+ wf = executor.get_workflow(run.workflow_id, include_tasks=True) + for task in wf.tasks: + if task.status in ('FAILED', 'FAILED_WITH_TERMINAL_ERROR'): + print(task.reference_task_name, task.reason_for_incompletion) """ workflow_input = workflow_input or {} if request_id is None: From 4e5e0bf58808a01ea86edbbbb9afb8b17efffbf8 Mon Sep 17 00:00:00 2001 From: nthmost-orkes Date: Tue, 28 Apr 2026 12:21:23 -0700 Subject: [PATCH 3/3] docs: update docstring heading --- src/conductor/client/workflow/executor/workflow_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conductor/client/workflow/executor/workflow_executor.py b/src/conductor/client/workflow/executor/workflow_executor.py index b65fe8466..eb870941e 100644 --- a/src/conductor/client/workflow/executor/workflow_executor.py +++ b/src/conductor/client/workflow/executor/workflow_executor.py @@ -96,7 +96,7 @@ def execute(self, name: str, version: Optional[int] = None, workflow_input: Any Returns when the workflow reaches a terminal state or ``wait_for_seconds`` elapses. If the timeout fires first, returns ``status='RUNNING'`` with empty output — not an error. - **Getting RUNNING back?** The default ``wait_for_seconds=10`` is shorter than the + **Getting RUNNING with no output after a worker exception?** The default ``wait_for_seconds=10`` is shorter than the default task ``retryDelaySeconds=60``. A failing worker triggers a 60 s retry wait, so the 10 s timeout fires while the retry is pending. Raise ``wait_for_seconds`` (e.g. 70) or inspect failed tasks::