From e50be0bed29f9cee90b67850c5e74e9b005f89a5 Mon Sep 17 00:00:00 2001 From: Jerry Mannil Date: Tue, 7 Jun 2022 13:24:35 -0700 Subject: [PATCH] Graceful exit on failures for multi-node runs * Use Popen.terminate() to stop the child processes gracefully; kill them if terminate() doesn't work * The Popen.kill() command causes the training processes to end abruptly. This may cause the child processes to become zombies without communicating properly to the parent process about the kill signal. So the ssh session continues to wait for signals from the child processes and never returns to the pdsh command Fixes microsoft#1995 --- deepspeed/launcher/launch.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 06321cbf90b3..7b11c0c188d3 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -15,6 +15,7 @@ import base64 import time import signal +import psutil from collections import defaultdict from argparse import ArgumentParser, REMAINDER @@ -87,6 +88,21 @@ def parse_args(): return parser.parse_args() +# Adapted from https://psutil.readthedocs.io/en/latest/#kill-process-tree +def terminate_process_tree(pid): + process = psutil.Process(pid) + children = process.children(recursive=True) + children.append(process) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + gone, alive = psutil.wait_procs(children, timeout=30) + for p in alive: + p.kill() + + def main(): args = parse_args() current_env = os.environ.copy() @@ -177,7 +193,7 @@ def sigkill_handler(signum, frame): for process in processes: logger.info(f"Killing subprocess {process.pid}") try: - process.kill() + terminate_process_tree(process.pid) except Exception: pass if last_return_code is not None: