-
Notifications
You must be signed in to change notification settings - Fork 56
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add "JH_UniScheduler" batch_type #459
Changes from 2 commits
a37e043
1aa07e4
cf5439d
a7f4a79
8adea81
aa881fd
3f577d0
70f1ee6
866f2a3
3c23af7
758d76c
8b17dfb
7841346
1f1d01e
87b2d68
9231bfc
603e2de
fc00abb
400b6ca
200b333
7df95d6
0146aa0
1dec4f1
590feb6
1453890
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
import shlex | ||
from typing import List | ||
|
||
from dargs import Argument | ||
|
||
from dpdispatcher.dlog import dlog | ||
from dpdispatcher.machine import Machine | ||
from dpdispatcher.utils.job_status import JobStatus | ||
from dpdispatcher.utils.utils import ( | ||
RetrySignal, | ||
customized_script_header_template, | ||
retry, | ||
) | ||
|
||
# Default #JSUB header template, filled in by gen_script_header().
# %J in the log-file names is presumably expanded by the scheduler to the
# job id (LSF-style) — TODO confirm against JH UniScheduler docs.
JH_UniScheduler_script_header_template = """\
#!/bin/bash -l
#JSUB -e %J.err
#JSUB -o %J.out
{JH_UniScheduler_nodes_line}
{JH_UniScheduler_ptile_line}
{JH_UniScheduler_partition_line}
{JH_UniScheduler_number_gpu_line}"""
|
||
|
||
class JH_UniScheduler(Machine):
    """Batch machine for the JH UniScheduler scheduler.

    Jobs are submitted with ``jsub``, polled with ``jjobs`` and killed
    with ``jctrl kill``.
    """

    def gen_script(self, job):
        """Generate the full batch script for *job* (delegates to the base class)."""
        return super().gen_script(job)

    def gen_script_header(self, job):
        """Generate the ``#JSUB`` header lines for *job*.

        If ``resources["strategy"]["customized_script_header_template_file"]``
        is set, that template file takes precedence over the built-in
        ``JH_UniScheduler_script_header_template``.
        """
        resources = job.resources
        script_header_dict = {
            "JH_UniScheduler_nodes_line": f"#JSUB -n {resources.number_node * resources.cpu_per_node}",
            "JH_UniScheduler_ptile_line": f"#JSUB -R 'span[ptile={resources.cpu_per_node}]'",
            "JH_UniScheduler_partition_line": f"#JSUB -q {resources.queue_name}",
        }
        custom_gpu_line = resources.kwargs.get("custom_gpu_line", None)
        if not custom_gpu_line:
            # NOTE(review): this emits "#JSUB -gpgpu 0" even when no GPU is
            # requested — confirm the scheduler accepts that.
            script_header_dict["JH_UniScheduler_number_gpu_line"] = (
                f"#JSUB -gpgpu {resources.gpu_per_node}"
            )
        else:
            script_header_dict["JH_UniScheduler_number_gpu_line"] = custom_gpu_line
        if (
            resources["strategy"].get("customized_script_header_template_file")
            is not None
        ):
            return customized_script_header_template(
                resources["strategy"]["customized_script_header_template_file"],
                resources,
            )
        return JH_UniScheduler_script_header_template.format(**script_header_dict)

    @retry()
    def do_submit(self, job):
        """Write the job script remotely, submit it with ``jsub`` and return
        the scheduler job id.

        Raises
        ------
        RetrySignal
            On a failed ``jsub`` call, so that ``@retry`` resubmits.
        """
        script_file_name = job.script_file_name
        script_str = self.gen_script(job)
        job_id_name = job.job_hash + "_job_id"
        self.context.write_file(fname=script_file_name, write_str=script_str)
        script_run_str = self.gen_script_command(job)
        script_run_file_name = f"{job.script_file_name}.run"
        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)

        try:
            stdin, stdout, stderr = self.context.block_checkcall(
                "cd {} && jsub < {}".format(
                    shlex.quote(self.context.remote_root),
                    shlex.quote(script_file_name),
                )
            )
        except RuntimeError as err:
            # Transient submission failure: let @retry try again.
            raise RetrySignal(err) from err

        subret = stdout.readlines()
        # jsub prints e.g. "Job <12345> is submitted ..."; take the second
        # token and strip the surrounding angle brackets.
        job_id = subret[0].split()[1][1:-1]
        self.context.write_file(job_id_name, job_id)
        return job_id

    def default_resources(self, resources):
        """No scheduler-specific resource defaults are required."""
        pass

    @retry()
    def check_status(self, job):
        """Map the ``jjobs`` state of *job* to a :class:`JobStatus` value.

        Raises
        ------
        RetrySignal
            On an unexpected non-zero ``jjobs`` exit code, so polling is
            retried.
        """
        try:
            job_id = job.job_id
        except AttributeError:
            return JobStatus.terminated
        if job_id == "":
            return JobStatus.unsubmitted
        ret, stdin, stdout, stderr = self.context.block_call("jjobs " + job_id)
        err_str = stderr.read().decode("utf-8")
        if f"Job <{job_id}> is not found" in err_str:
            # The scheduler has forgotten the job: finished if the finish
            # tag was written, otherwise it terminated abnormally.
            if self.check_finish_tag(job):
                return JobStatus.finished
            else:
                return JobStatus.terminated
        elif ret != 0:
            # just retry when any unknown error raised.
            raise RetrySignal(
                "Get error code %d in checking status through ssh with job: %s . message: %s"
                % (ret, job.job_hash, err_str)
            )
        status_out = stdout.read().decode("utf-8").split("\n")
        if len(status_out) < 2:
            return JobStatus.unknown
        else:
            # Line 0 is the jjobs header; line 1 is the data row whose third
            # column is the job state.
            status_line = status_out[1]
            status_word = status_line.split()[2]

        if status_word in ["PEND"]:
            return JobStatus.waiting
        elif status_word in ["RUN", "PSUSP", "SSUSP", "USUSP"]:
            return JobStatus.running
        elif status_word in ["DONE", "EXIT"]:
            if self.check_finish_tag(job):
                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
                return JobStatus.finished
            else:
                return JobStatus.terminated
        else:
            return JobStatus.unknown

    def check_finish_tag(self, job):
        """Return True if the remote finish-tag file for *job* exists."""
        job_tag_finished = job.job_hash + "_job_tag_finished"
        return self.context.check_file_exists(job_tag_finished)

    # FIX: this hook takes ``cls`` and is invoked on the class in the
    # Machine argument machinery; it must be a classmethod.
    @classmethod
    def resources_subfields(cls) -> List[Argument]:
        """Generate the resources subfields.

        Returns
        -------
        list[Argument]
            resources subfields
        """
        doc_custom_gpu_line = "Custom GPU configuration, starting with #JSUB"

        return [
            Argument(
                "kwargs",
                dict,
                [
                    Argument(
                        "custom_gpu_line",
                        str,
                        optional=True,
                        default=None,
                        doc=doc_custom_gpu_line,
                    ),
                ],
                optional=False,
                doc="Extra arguments.",
            )
        ]

    def kill(self, job):
        """Kill the job.

        Parameters
        ----------
        job : Job
            job
        """
        job_id = job.job_id
        ret, stdin, stdout, stderr = self.context.block_call(
            "jctrl kill " + str(job_id)
        )
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import json
import os
import sys

# FIX: the repository root must be prepended to sys.path *before* importing
# dpdispatcher, so the local checkout is used instead of any installed copy.
# (The original inserted the path after the imports, which had no effect.)
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from dpdispatcher.machine import Machine
from dpdispatcher.submission import Resources, Submission, Task

# task_need_resources has no effect
with open("jsons/machine_JH_UniScheduler.json") as f:
    mdata = json.load(f)

machine = Machine.load_from_dict(mdata["machine"])
resources = Resources.load_from_dict(mdata["resources"])

submission = Submission(
    work_base="0_md/",
    machine=machine,
    resources=resources,
    forward_common_files=["graph.pb"],
    backward_common_files=[],
)

# Four identical LAMMPS tasks that differ only in their working directory.
task_list = [
    Task(
        command="lmp -i input.lammps",
        task_work_path=f"bct-{index}/",
        forward_files=["conf.lmp", "input.lammps"],
        backward_files=["log.lammps"],
    )
    for index in range(1, 5)
]
submission.register_task_list(task_list)
submission.run_submission(clean=True)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please also add documentation for the new batch type to doc/batch.md.