From cc9514fed666f0104a7e89c7c361b97353a575e6 Mon Sep 17 00:00:00 2001 From: Hilary James Oliver Date: Sat, 12 Aug 2017 19:45:17 +1200 Subject: [PATCH] Polling logic overhauled. * Allow reset to 'submitted' or 'running'. * Allow polling of succeeded or failed tasks (but not succeeded by default). * Poll to confirm, if a message implies a state reversal. * Remove 'enable resurrection' - all tasks can return from failed. * Document how to handle preemption in light of these changes. --- bin/cylc-poll | 6 +- conf/cylc.lang | 1 - conf/cylc.xml | 1 - doc/src/cylc-user-guide/cug.tex | 44 ++---- doc/src/cylc-user-guide/suiterc.tex | 16 -- lib/cylc/cfgspec/suite.py | 2 +- lib/cylc/gui/app_gcylc.py | 52 ++----- .../network/https/suite_command_server.py | 5 +- lib/cylc/scheduler.py | 14 +- lib/cylc/task_events_mgr.py | 137 ++++++++++-------- lib/cylc/task_job_mgr.py | 82 +++++++---- lib/cylc/task_outputs.py | 5 + lib/cylc/task_pool.py | 3 +- lib/cylc/task_proxy.py | 2 +- lib/cylc/task_state.py | 16 +- .../cylc-get-config/00-simple/section2.stdout | 13 -- tests/cylc-get-config/04-dummy-mode-output.t | 4 +- tests/cylc-poll/12-reverse-state.t | 32 ++++ .../cylc-poll/12-reverse-state/reference.log | 4 + tests/cylc-poll/12-reverse-state/suite.rc | 43 ++++++ 20 files changed, 276 insertions(+), 206 deletions(-) create mode 100755 tests/cylc-poll/12-reverse-state.t create mode 100644 tests/cylc-poll/12-reverse-state/reference.log create mode 100644 tests/cylc-poll/12-reverse-state/suite.rc diff --git a/bin/cylc-poll b/bin/cylc-poll index 18c4f241078..2d4dc665849 100755 --- a/bin/cylc-poll +++ b/bin/cylc-poll @@ -51,6 +51,10 @@ def main(): ('REG', 'Suite name'), ('[TASKID ...]', 'Task identifiers')]) + parser.add_option( + "-s", "--succeeded", help="Allow polling of succeeded tasks.", + action="store_true", default=False, dest="poll_all") + options, args = parser.parse_args() suite = args.pop(0) @@ -63,7 +67,7 @@ def main(): options.comms_timeout, my_uuid=options.set_uuid, print_uuid=options.print_uuid) items = parser.parse_multitask_compat(options, args) - pclient.put_command('poll_tasks', items=items) + pclient.put_command('poll_tasks', items=items, poll_all=options.poll_all) if __name__ == "__main__": diff --git a/conf/cylc.lang b/conf/cylc.lang index 87b19ac1f2b..d0a307bdf8b 100644 --- a/conf/cylc.lang +++ b/conf/cylc.lang @@ -179,7 +179,6 @@ exclude at start-up exclude env-script - enable resurrection disable automatic shutdown description default node attributes diff --git a/conf/cylc.xml b/conf/cylc.xml index 7c59e727ff5..28f50c37d77 100644 --- a/conf/cylc.xml +++ b/conf/cylc.xml @@ -106,7 +106,6 @@ - diff --git a/doc/src/cylc-user-guide/cug.tex b/doc/src/cylc-user-guide/cug.tex index 51dbeff860e..73aae3d2ea7 100644 --- a/doc/src/cylc-user-guide/cug.tex +++ b/doc/src/cylc-user-guide/cug.tex @@ -6742,36 +6742,20 @@ \subsection{Handling Job Preemption} or suspend running low priority jobs in order to make way for high priority jobs. The preempted jobs may then be automatically restarted by the resource manager, from the same point (if suspended) or requeued -to run again from the start (if killed). If a running cylc task gets -suspended or hard-killed -(\lstinline=kill -9 = is not a trappable signal so cylc cannot detect -task failure in this case) and then later restarted, it will just appear -to cylc as if it takes longer than normal to run. If the job is -soft-killed the signal will be trapped by the task job script and a -failure message sent, resulting in cylc putting the task into the failed -state. When the preempted task restarts and sends its started message -cylc would normally treat this as an error condition (a dead task is not -supposed to be sending messages) - a warning will be logged and the task -will remain in the failed state. However, if you know that preemption is -possible on your system you can tell cylc that affected tasks should be -resurrected from the dead, to carry on as normal if progress messages -start coming in again after a failure: - -\lstset{language=suiterc} -\begin{lstlisting} -# ... -[runtime] - [[HPC]] - enable resurrection = True - [[TaskFoo]] - inherit = HPC -# ... -\end{lstlisting} - -To test this in any suite, manually kill a running task then, after cylc -registers the task failed, resubmit the killed job manually by -cutting-and-pasting the original job submission command from the suite -stdout stream. +to run again from the start (if killed). + +Suspended jobs will poll as still running (their job status file says they +started running, and they still appear in the resource manager queue). +Loadleveler jobs that are preempted by kill-and-requeue ("job vacation") are +automatically returned to the submitted state by Cylc. This is possible +because Loadleveler sends the SIGUSR1 signal before SIGKILL for preemption. +Other batch schedulers just send SIGTERM before SIGKILL as normal, so Cylc +cannot distinguish a preemption job kill from a normal job kill. After this the +job will poll as failed (correctly, because it was killed, and the job status +file records that). To handle this kind of preemption automatically you could +use a task failed or retry event handler that queries the batch scheduler queue +(after an appropriate delay if necessary) and then, if the job has been +requeued, uses \lstinline=cylc reset= to reset the task to the submitted state. \subsection{Manual Task Triggering and Edit-Run} diff --git a/doc/src/cylc-user-guide/suiterc.tex b/doc/src/cylc-user-guide/suiterc.tex index a7c72ab4098..3137edf762c 100644 --- a/doc/src/cylc-user-guide/suiterc.tex +++ b/doc/src/cylc-user-guide/suiterc.tex @@ -1210,22 +1210,6 @@ \subsection{[runtime]} instances of the task will share the same workspace. Consider the effect on cycle point offset housekeeping of work directories before doing this. -\paragraph[enable resurrection]{ [runtime] \textrightarrow [[\_\_NAME\_\_]] \textrightarrow enable resurrection} - -If a message is received from a failed task cylc will normally treat -this as an error condition, issue a warning, and leave the task in the -``failed'' state. But if ``enable resurrection'' is switched on failed -tasks can come back from the dead: if the same task job script is -executed again cylc will put the task back into the running state and -continue as normal when the started message is received. This can be -used to handle HPC-style job preemption wherein a resource manager may -kill a running task and reschedule it to run again later, to make way -for a job with higher immediate priority. See also~\ref{PreemptionHPC} -\begin{myitemize} -\item {\em type:} boolean -\item {\em default:} False -\end{myitemize} - \paragraph[{[[[}meta{]]]}]{[runtime] \textrightarrow [[\_\_NAME\_\_]] \textrightarrow [[[meta]]]} Section containing metadata items for this task or family namespace. Several items diff --git a/lib/cylc/cfgspec/suite.py b/lib/cylc/cfgspec/suite.py index b6bca9a8f41..68bca25254a 100644 --- a/lib/cylc/cfgspec/suite.py +++ b/lib/cylc/cfgspec/suite.py @@ -336,7 +336,6 @@ def _coerce_parameter_list(value, keys, _): 'script': vdr(vtype='string', default=""), 'post-script': vdr(vtype='string', default=""), 'extra log files': vdr(vtype='string_list', default=[]), - 'enable resurrection': vdr(vtype='boolean', default=False), 'work sub-directory': vdr(vtype='string'), 'meta': { 'title': vdr(vtype='string', default=""), @@ -531,6 +530,7 @@ def upg(cfg, descr): u.obsolete('7.2.2', ['cylc', 'simulation mode']) u.obsolete('7.2.2', ['runtime', '__MANY__', 'dummy mode']) u.obsolete('7.2.2', ['runtime', '__MANY__', 'simulation mode']) + u.obsolete('7.5.0', ['runtime', '__MANY__', 'enable resurrection']) u.upgrade() diff --git a/lib/cylc/gui/app_gcylc.py b/lib/cylc/gui/app_gcylc.py index 3e7c5b44f07..c7aaba9de7f 100644 --- a/lib/cylc/gui/app_gcylc.py +++ b/lib/cylc/gui/app_gcylc.py @@ -66,11 +66,10 @@ from cylc.cfgspec.gcylc import gcfg from cylc.wallclock import get_current_time_string from cylc.task_state import ( - TASK_STATUSES_ALL, TASK_STATUSES_RESTRICTED, + TASK_STATUSES_ALL, TASK_STATUSES_RESTRICTED, TASK_STATUSES_CAN_RESET_TO, TASK_STATUSES_WITH_JOB_SCRIPT, TASK_STATUSES_WITH_JOB_LOGS, - TASK_STATUSES_TRIGGERABLE, TASK_STATUSES_ACTIVE, - TASK_STATUS_WAITING, TASK_STATUS_HELD, TASK_STATUS_READY, - TASK_STATUS_RUNNING, TASK_STATUS_SUCCEEDED, TASK_STATUS_FAILED) + TASK_STATUSES_TRIGGERABLE, TASK_STATUSES_ACTIVE, TASK_STATUS_RUNNING, + TASK_STATUS_HELD, TASK_STATUS_FAILED) from cylc.task_state_prop import get_status_prop @@ -1500,42 +1499,15 @@ def get_right_click_menu(self, task_ids, t_states, task_is_family=False, # graph-view so use connect_right_click_sub_menu instead of # item.connect - reset_ready_item = gtk.ImageMenuItem('"%s"' % TASK_STATUS_READY) - reset_img = gtk.image_new_from_stock( - gtk.STOCK_CONVERT, gtk.ICON_SIZE_MENU) - reset_ready_item.set_image(reset_img) - reset_menu.append(reset_ready_item) - self.connect_right_click_sub_menu(is_graph_view, reset_ready_item, - self.reset_task_state, task_ids, - TASK_STATUS_READY) - - reset_waiting_item = gtk.ImageMenuItem('"%s"' % TASK_STATUS_WAITING) - reset_img = gtk.image_new_from_stock( - gtk.STOCK_CONVERT, gtk.ICON_SIZE_MENU) - reset_waiting_item.set_image(reset_img) - reset_menu.append(reset_waiting_item) - self.connect_right_click_sub_menu(is_graph_view, reset_waiting_item, - self.reset_task_state, task_ids, - TASK_STATUS_WAITING) - - reset_succeeded_item = gtk.ImageMenuItem( - '"%s"' % TASK_STATUS_SUCCEEDED) - reset_img = gtk.image_new_from_stock(gtk.STOCK_CONVERT, - gtk.ICON_SIZE_MENU) - reset_succeeded_item.set_image(reset_img) - reset_menu.append(reset_succeeded_item) - self.connect_right_click_sub_menu(is_graph_view, reset_succeeded_item, - self.reset_task_state, task_ids, - TASK_STATUS_SUCCEEDED) - - reset_failed_item = gtk.ImageMenuItem('"%s"' % TASK_STATUS_FAILED) - reset_img = gtk.image_new_from_stock(gtk.STOCK_CONVERT, - gtk.ICON_SIZE_MENU) - reset_failed_item.set_image(reset_img) - reset_menu.append(reset_failed_item) - self.connect_right_click_sub_menu(is_graph_view, reset_failed_item, - self.reset_task_state, task_ids, - TASK_STATUS_FAILED) + for status in TASK_STATUSES_CAN_RESET_TO: + reset_item = gtk.ImageMenuItem('"%s"' % status) + reset_img = gtk.image_new_from_stock( + gtk.STOCK_CONVERT, gtk.ICON_SIZE_MENU) + reset_item.set_image(reset_img) + reset_menu.append(reset_item) + self.connect_right_click_sub_menu(is_graph_view, reset_item, + self.reset_task_state, task_ids, + status) spawn_item = gtk.ImageMenuItem('Force spawn') img = gtk.image_new_from_stock(gtk.STOCK_ADD, gtk.ICON_SIZE_MENU) diff --git a/lib/cylc/network/https/suite_command_server.py b/lib/cylc/network/https/suite_command_server.py index c5e83a6bd98..82794505d31 100644 --- a/lib/cylc/network/https/suite_command_server.py +++ b/lib/cylc/network/https/suite_command_server.py @@ -165,10 +165,11 @@ def reload_suite(self): @cherrypy.expose @cherrypy.tools.json_out() - def poll_tasks(self, items=None): + def poll_tasks(self, items=None, poll_all=False): if items is not None and not isinstance(items, list): items = [items] - return self._put("poll_tasks", (items,)) + return self._put("poll_tasks", (items,), + {"poll_all": poll_all in ['True', True]}) @cherrypy.expose @cherrypy.tools.json_out() diff --git a/lib/cylc/scheduler.py b/lib/cylc/scheduler.py index 87578139510..e1319bcef15 100644 --- a/lib/cylc/scheduler.py +++ b/lib/cylc/scheduler.py @@ -498,7 +498,8 @@ def process_queued_task_messages(self): if itask.identity in task_id_messages: for priority, message in task_id_messages[itask.identity]: self.task_events_mgr.process_message( - itask, priority, message, is_incoming=True) + itask, priority, message, + self.task_job_mgr.poll_task_jobs, is_incoming=True) def process_command_queue(self): """Process queued commands.""" @@ -693,12 +694,17 @@ def command_release_tasks(self, items): """Release tasks.""" return self.pool.release_tasks(items) - def command_poll_tasks(self, items=None): - """Poll all tasks or a task/family if options are provided.""" + def command_poll_tasks(self, items=None, poll_all=False): + """Poll pollable tasks or a task/family if options are provided. + + Don't poll succeeded tasks unless poll_all is True. + + """ if self.run_mode == 'simulation': return itasks, bad_items = self.pool.filter_task_proxies(items) - self.task_job_mgr.poll_task_jobs(self.suite, itasks, items is not None) + self.task_job_mgr.poll_task_jobs(self.suite, itasks, items is not None, + poll_all=poll_all) return len(bad_items) def command_kill_tasks(self, items=None): diff --git a/lib/cylc/task_events_mgr.py b/lib/cylc/task_events_mgr.py index c7bf983d833..17cd3b61512 100644 --- a/lib/cylc/task_events_mgr.py +++ b/lib/cylc/task_events_mgr.py @@ -47,10 +47,9 @@ from cylc.task_action_timer import TaskActionTimer from cylc.task_message import TaskMessage from cylc.task_state import ( - TASK_STATUSES_ACTIVE, TASK_STATUS_READY, TASK_STATUS_SUBMITTED, - TASK_STATUS_SUBMIT_RETRYING, TASK_STATUS_SUBMIT_FAILED, - TASK_STATUS_RUNNING, TASK_STATUS_RETRYING, TASK_STATUS_FAILED, - TASK_STATUS_SUCCEEDED) + TASK_STATUS_READY, TASK_STATUS_SUBMITTED, TASK_STATUS_SUBMIT_RETRYING, + TASK_STATUS_SUBMIT_FAILED, TASK_STATUS_RUNNING, TASK_STATUS_RETRYING, + TASK_STATUS_FAILED, TASK_STATUS_SUCCEEDED) from cylc.task_outputs import ( TASK_OUTPUT_SUBMITTED, TASK_OUTPUT_STARTED, TASK_OUTPUT_SUCCEEDED, TASK_OUTPUT_FAILED) @@ -242,16 +241,24 @@ def process_events(self, schd_ctx): elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE: self._process_job_logs_retrieval(schd_ctx, ctx, id_keys) - def process_message(self, itask, priority, message, poll_event_time=None, - is_incoming=False): + def process_message(self, itask, priority, message, poll_func, + poll_event_time=None, is_incoming=False): """Parse an incoming task message and update task state. - Incoming is e.g. "succeeded at