Skip to content

Commit

Permalink
add support for new dpdispatcher (#449)
Browse files Browse the repository at this point in the history
* fix show error

* use new dpdispatcher

* update new dpdispatcher

* update for new dpdispatcher

* support dpdispatcher

* fix bug when using new dpdispatcher

* update for new dpdispatcher init_bulk

* update examples for new dpdispatcher

* fix install

* fix typo use LooseVersion

Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>

* update LooseVersion

* update dependency dpdispatcher min version

* fix example

* update min dpdispatcher version

* update make_submission change trans_comm_data to forward_common_files

* delete unused compatibility code; change mdata to mdata_machine in make_submission

* update group_size for new dpdispatcher

* fix bug change indents to spaces

* update docs for new dpdispatcher

* updata args check for key local_root; update examples/CH4-refact-dpdispatcher; clean dupilicated files

* update dpdispatcher for auto_test

* update warnings for old dpdispatcher

* update docs; explain dpgen new dpdispatcher more clearly

* fix bug; LooseVersion(api_version) == "1.0" is handled now

* more clear new dpdispatcher introductions

* update README.md better docs format

* update links

* update docs for new dpdispatcher

* detailed docs about new dpdispatcher

* update docs for new dpdispatcher

* update docs about dpgen new dpdispatcher

* update docs for new dpdispatcher

* update soft link

Co-authored-by: felix5572 <yfb222333@gmail.com>
Co-authored-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
  • Loading branch information
3 people committed Jul 6, 2021
1 parent e89d7fc commit 66492d1
Show file tree
Hide file tree
Showing 21 changed files with 861 additions and 318 deletions.
104 changes: 104 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,110 @@ Here `pick_data` is the data to simplify and currently only supports `MultiSyste


## Set up machine
### new dpdispatcher update note
dpdispatcher Update Note:
dpdispatcher has updated and the api of `machine.json` is changed.
dpgen will use new dpdispatcher if the key `api_version` in dpgen's `machine.json`'s value is equal or large than `1.0`.

And dpgen will use old dpdispatcher if the key `api_version` is not specified in `machine.json` or the `api_version` is smaller than `1.0`.
This gurantees that the old `machine.json`s still work.

And for now dpdispatcher is maintained on a seperate repo.
The repo link: https://github.com/deepmodeling/dpdispatcher

The api of new dpdispatcher is close to old one except for a few changes.

The new `machine.json` examples can be seen [here](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/getting-started.html)

And Here are the explanations of the keys in [machine](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/machine.html)
[resources](https://docs.deepmodeling.org/projects/dpdispatcher/en/latest/resources.html).


Here is a example `machine.json` for dpgen's new dpdispatcher.
Please check the [documents](https://deepmd.readthedocs.io/projects/dpdispatcher/en/latest/) for more information about new dpdispatcher.

an example of new dpgen's machine.json
```json
{
"api_version": "1.0",
"train": [
{
"command": "dp",
"machine": {
"batch_type": "PBS",
"context_type": "SSHContext",
"local_root": "./",
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
"remote_profile": {
"hostname": "39.xxx.xx.xx",
"username": "user1234"
}
},
"resources": {
"number_node": 1,
"cpu_per_node": 4,
"gpu_per_node": 1,
"queue_name": "T4_4_15",
"group_size": 1,
"custom_flags":["#SBATCH --mem=32G"],
"strategy": {"if_cuda_multi_devices": true},
"para_deg": 3,
"source_list": ["/home/user1234/deepmd.1.2.4.env"]
}
}
],
"model_devi":[
{
"command": "lmp",
"machine":{
"batch_type": "PBS",
"context_type": "SSHContext",
"local_root": "./",
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
"remote_profile": {
"hostname": "39.xxx.xx.xx",
"username": "user1234"
}
},
"resources": {
"number_node": 1,
"cpu_per_node": 4,
"gpu_per_node": 1,
"queue_name": "T4_4_15",
"group_size": 5,
"source_list": ["/home/user1234/deepmd.1.2.4.env"]
}
}
],
"fp":[
{
"command": "vasp_std",
"machine":{
"batch_type": "PBS",
"context_type": "SSHContext",
"local_root": "./",
"remote_root": "/home/user1234/work_path_dpdispatcher_test",
"remote_profile": {
"hostname": "39.xxx.xx.xx",
"username": "user1234"
}
},
"resources": {
"number_node": 1,
"cpu_per_node": 32,
"gpu_per_node": 0,
"queue_name": "G_32_128",
"group_size": 1,
"source_list": ["~/vasp.env"]
}
}
]
}
```
note1: the key "local_root" in dpgen's machine.json is always `./`

### old dpdispatcher

When switching into a new machine, you may modifying the `MACHINE`, according to the actual circumstance. Once you have finished, the `MACHINE` can be re-used for any DP-GEN tasks without any extra efforts.

An example for `MACHINE` is:
Expand Down
2 changes: 2 additions & 0 deletions dpgen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def info():
print('%10s %10s %s' % (modui, mm.__version__, mm.__path__[0]))
except ImportError:
print('%10s %10s Not Found' % (modui, ''))
except AttributeError:
print('%10s %10s unknown version or path' %(modui, ''))
print()

# reference
Expand Down
43 changes: 33 additions & 10 deletions dpgen/auto_test/common_equi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import glob
import os

import warnings
from monty.serialization import dumpfn

import dpgen.auto_test.lib.crys as crys
Expand All @@ -10,6 +10,8 @@
from dpgen.auto_test.mpdb import get_structure
from dpgen.dispatcher.Dispatcher import make_dispatcher
from dpgen.remote.decide_machine import decide_fp_machine, decide_model_devi_machine
from distutils.version import LooseVersion
from dpgen.dispatcher.Dispatcher import make_submission

lammps_task_type = ['deepmd', 'meam', 'eam_fs', 'eam_alloy']

Expand Down Expand Up @@ -149,23 +151,44 @@ def run_equi(confs,
if len(run_tasks) == 0:
return
else:
# if LooseVersion()
run_tasks = [os.path.basename(ii) for ii in all_task]
machine, resources, command, group_size = util.get_machine_info(mdata, inter_type)
print('%d tasks will be submited '%len(run_tasks))
for ii in range(len(work_path_list)):
work_path = work_path_list[ii]
disp = make_dispatcher(machine, resources, work_path, [run_tasks[ii]], group_size)
print("%s --> Runing... "%(work_path))

api_version = mdata.get('api_version', '0.9')
if LooseVersion(api_version) < LooseVersion('1.0'):
warnings.warn(f"the dpdispatcher will be updated to new version."
f"And the interface may be changed. Please check the documents for more details")
disp.run_jobs(resources,
command,
work_path,
[run_tasks[ii]],
group_size,
forward_common_files,
forward_files,
backward_files,
outlog='outlog',
errlog='errlog')
command,
work_path,
[run_tasks[ii]],
group_size,
forward_common_files,
forward_files,
backward_files,
outlog='outlog',
errlog='errlog')
elif LooseVersion(api_version) >= LooseVersion('1.0'):
submission = make_submission(
mdata_machine=machine,
mdata_resource=resources,
commands=[command],
work_path=work_path,
run_tasks=run_tasks,
group_size=group_size,
forward_common_files=forward_common_files,
forward_files=forward_files,
backward_files=backward_files,
outlog = 'outlog',
errlog = 'errlog'
)
submission.run_submission()


def post_equi(confs, inter_param):
Expand Down
25 changes: 23 additions & 2 deletions dpgen/auto_test/common_prop.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from distutils.version import LooseVersion
import glob
import os
import warnings
from multiprocessing import Pool
import dpgen.auto_test.lib.util as util
from dpgen import dlog
Expand All @@ -12,6 +14,7 @@
from dpgen.auto_test.calculator import make_calculator
from dpgen.dispatcher.Dispatcher import make_dispatcher
from dpgen.remote.decide_machine import decide_fp_machine, decide_model_devi_machine
from dpgen.dispatcher.Dispatcher import make_submission

lammps_task_type = ['deepmd', 'meam', 'eam_fs', 'eam_alloy']

Expand Down Expand Up @@ -196,7 +199,11 @@ def worker(work_path,
run_tasks = [os.path.basename(ii) for ii in all_task]
machine, resources, command, group_size = util.get_machine_info(mdata, inter_type)
disp = make_dispatcher(machine, resources, work_path, run_tasks, group_size)
disp.run_jobs(resources,
api_version = mdata.get('api_version', '0.9')
if LooseVersion(api_version) < LooseVersion('1.0'):
warnings.warn(f"the dpdispatcher will be updated to new version."
f"And the interface may be changed. Please check the documents for more details")
disp.run_jobs(resources,
command,
work_path,
run_tasks,
Expand All @@ -206,7 +213,21 @@ def worker(work_path,
backward_files,
outlog='outlog',
errlog='errlog')

elif LooseVersion(api_version) >= LooseVersion('1.0'):
submission = make_submission(
mdata_machine=machine,
mdata_resource=resources,
commands=[command],
work_path=work_path,
run_tasks=run_tasks,
group_size=group_size,
forward_common_files=forward_common_files,
forward_files=forward_files,
backward_files=backward_files,
outlog = 'outlog',
errlog = 'errlog'
)
submission.run_submission()

def post_property(confs,
# inter_param,
Expand Down
52 changes: 46 additions & 6 deletions dpgen/data/gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@
import dpgen.data.tools.bcc as bcc
import dpgen.data.tools.diamond as diamond
import dpgen.data.tools.sc as sc
from distutils.version import LooseVersion
from dpgen.generator.lib.vasp import incar_upper
from pymatgen.core import Structure
from pymatgen.io.vasp import Incar
from dpgen.remote.decide_machine import decide_fp_machine
from dpgen import ROOT_PATH
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher
from dpgen.dispatcher.Dispatcher import Dispatcher, make_dispatcher, make_submission



Expand Down Expand Up @@ -581,9 +582,13 @@ def run_vasp_relax(jdata, mdata):
# if not _vasp_check_fin(ii):
# relax_run_tasks.append(ii)
run_tasks = [os.path.basename(ii) for ii in relax_run_tasks]
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
#dlog.info(run_tasks)
dispatcher.run_jobs(fp_resources,

api_version = mdata.get('api_version', '0.9')
if LooseVersion(api_version) < LooseVersion('1.0'):
warnings.warn(f"the dpdispatcher will be updated to new version."
f"And the interface may be changed. Please check the documents for more details")
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
dispatcher.run_jobs(fp_resources,
[fp_command],
work_dir,
run_tasks,
Expand All @@ -592,6 +597,22 @@ def run_vasp_relax(jdata, mdata):
forward_files,
backward_files)

elif LooseVersion(api_version) >= LooseVersion('1.0'):
submission = make_submission(
mdata['fp_machine'],
mdata['fp_resources'],
commands=[fp_command],
work_path=work_dir,
run_tasks=run_tasks,
group_size=fp_group_size,
forward_common_files=forward_common_files,
forward_files=forward_files,
backward_files=backward_files,
outlog = 'fp.log',
errlog = 'fp.log')
submission.run_submission()


def run_vasp_md(jdata, mdata):
fp_command = mdata['fp_command']
fp_group_size = mdata['fp_group_size']
Expand Down Expand Up @@ -627,8 +648,12 @@ def run_vasp_md(jdata, mdata):
run_tasks = [ii.replace(work_dir+"/", "") for ii in md_run_tasks]
#dlog.info("md_work_dir", work_dir)
#dlog.info("run_tasks",run_tasks)
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
dispatcher.run_jobs(fp_resources,
api_version = mdata.get('api_version', '0.9')
if LooseVersion(api_version) < LooseVersion('1.0'):
warnings.warn(f"the dpdispatcher will be updated to new version."
f"And the interface may be changed. Please check the documents for more details")
dispatcher = make_dispatcher(mdata['fp_machine'], mdata['fp_resources'], work_dir, run_tasks, fp_group_size)
dispatcher.run_jobs(fp_resources,
[fp_command],
work_dir,
run_tasks,
Expand All @@ -637,6 +662,21 @@ def run_vasp_md(jdata, mdata):
forward_files,
backward_files)

elif LooseVersion(api_version) >= LooseVersion('1.0'):
submission = make_submission(
mdata['fp_machine'],
mdata['fp_resources'],
commands=[fp_command],
work_path=work_dir,
run_tasks=run_tasks,
group_size=fp_group_size,
forward_common_files=forward_common_files,
forward_files=forward_files,
backward_files=backward_files,
outlog = 'fp.log',
errlog = 'fp.log')
submission.run_submission()

def gen_init_bulk(args) :
try:
import ruamel
Expand Down
41 changes: 41 additions & 0 deletions dpgen/dispatcher/Dispatcher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from distutils.version import LooseVersion
import os,sys,time,random,json,glob
from dpdispatcher import Task, Submission, Resources, Machine
from dpgen.dispatcher.LocalContext import LocalSession
from dpgen.dispatcher.LocalContext import LocalContext
from dpgen.dispatcher.LazyLocalContext import LazyLocalContext
Expand All @@ -12,6 +14,8 @@
from dpgen.dispatcher.JobStatus import JobStatus
from dpgen import dlog
from hashlib import sha1
# import dargs
from dargs.dargs import Argument

def _split_tasks(tasks,
group_size):
Expand Down Expand Up @@ -338,3 +342,40 @@ def make_dispatcher(mdata, mdata_resource=None, work_path=None, run_tasks=None,
context_type = 'lazy-local'
disp = Dispatcher(mdata, context_type=context_type, batch_type=batch_type)
return disp

def make_submission(mdata_machine, mdata_resources, commands, work_path, run_tasks, group_size,
forward_common_files, forward_files, backward_files, outlog, errlog):

machine = Machine.load_from_dict(mdata_machine)
resources = Resources.load_from_dict(mdata_resources)

if resources['local_root'] != './':
raise RuntimeError(f"local_root must be './' in dpgen's machine.json.")

command = "&&".join(commands)

task_list = []
for ii in run_tasks:
task = Task(
command=command,
task_work_path=ii,
forward_files=forward_files,
backward_files=backward_files,
outlog=outlog,
errlog=errlog
)
task_list.append(task)

submission = Submission(
work_base=work_path,
machine=machine,
resources=resources,
task_list=task_list,
forward_common_files=forward_common_files,
backward_common_files=[]
)
return submission




0 comments on commit 66492d1

Please sign in to comment.