-
Notifications
You must be signed in to change notification settings - Fork 3
Closed
Description
When running with this input.json:
{
"_comment": "multi-task finetuning for DPA-2.3.0-medium",
"model": {
"shared_dict": {
"type_map_all": [
"H",
"He",
"Li",
"Be",
"B",
"C",
"N",
"O",
"F",
"Ne",
"Na",
"Mg",
"Al",
"Si",
"P",
"S",
"Cl",
"Ar",
"K",
"Ca",
"Sc",
"Ti",
"V",
"Cr",
"Mn",
"Fe",
"Co",
"Ni",
"Cu",
"Zn",
"Ga",
"Ge",
"As",
"Se",
"Br",
"Kr",
"Rb",
"Sr",
"Y",
"Zr",
"Nb",
"Mo",
"Tc",
"Ru",
"Rh",
"Pd",
"Ag",
"Cd",
"In",
"Sn",
"Sb",
"Te",
"I",
"Xe",
"Cs",
"Ba",
"La",
"Ce",
"Pr",
"Nd",
"Pm",
"Sm",
"Eu",
"Gd",
"Tb",
"Dy",
"Ho",
"Er",
"Tm",
"Yb",
"Lu",
"Hf",
"Ta",
"W",
"Re",
"Os",
"Ir",
"Pt",
"Au",
"Hg",
"Tl",
"Pb",
"Bi",
"Po",
"At",
"Rn",
"Fr",
"Ra",
"Ac",
"Th",
"Pa",
"U",
"Np",
"Pu",
"Am",
"Cm",
"Bk",
"Cf",
"Es",
"Fm",
"Md",
"No",
"Lr",
"Rf",
"Db",
"Sg",
"Bh",
"Hs",
"Mt",
"Ds",
"Rg",
"Cn",
"Nh",
"Fl",
"Mc",
"Lv",
"Ts",
"Og"
],
"dpa2_descriptor": {
"type": "dpa2",
"repinit": {
"tebd_dim": 8,
"rcut": 6.0,
"rcut_smth": 0.5,
"nsel": 120,
"neuron": [
25,
50,
100
],
"axis_neuron": 12,
"activation_function": "tanh",
"three_body_sel": 40,
"three_body_rcut": 4.0,
"three_body_rcut_smth": 3.5,
"use_three_body": true
},
"repformer": {
"rcut": 4.0,
"rcut_smth": 3.5,
"nsel": 40,
"nlayers": 6,
"g1_dim": 128,
"g2_dim": 32,
"attn2_hidden": 32,
"attn2_nhead": 4,
"attn1_hidden": 128,
"attn1_nhead": 4,
"axis_neuron": 4,
"update_h2": false,
"update_g1_has_conv": true,
"update_g1_has_grrg": true,
"update_g1_has_drrd": true,
"update_g1_has_attn": false,
"update_g2_has_g1g1": false,
"update_g2_has_attn": true,
"update_style": "res_residual",
"update_residual": 0.01,
"update_residual_init": "norm",
"attn2_has_gate": true,
"use_sqrt_nnei": true,
"g1_out_conv": true,
"g1_out_mlp": true
},
"add_tebd_to_repinit_out": false
},
"_comment": "that's all"
},
"model_dict": {
"Target_FTS": {
"finetune_head": "MP_traj_v024_alldata_mixu",
"type_map": "type_map_all",
"descriptor": "dpa2_descriptor",
"fitting_net": {
"neuron": [
240,
240,
240
],
"activation_function": "tanh",
"resnet_dt": true,
"seed": 19090,
"_comment": " that's all"
}
},
"MP_traj_v024_alldata_mixu": {
"finetune_head": "MP_traj_v024_alldata_mixu",
"type_map": "type_map_all",
"descriptor": "dpa2_descriptor",
"fitting_net": {
"neuron": [
240,
240,
240
],
"activation_function": "tanh",
"resnet_dt": true,
"seed": 19090,
"_comment": " that's all"
}
}
}
},
"learning_rate": {
"type": "exp",
"decay_steps": 2000,
"start_lr": 0.001,
"stop_lr": 3.51e-8,
"decay_rate": 0.98,
"_comment": "that's all"
},
"loss_dict": {
"Target_FTS": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 1,
"start_pref_f": 1000,
"limit_pref_f": 1,
"start_pref_v": 0,
"limit_pref_v": 0
},
"MP_traj_v024_alldata_mixu": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 1,
"start_pref_f": 1000,
"limit_pref_f": 1,
"start_pref_v": 0.02,
"limit_pref_v": 1
},
"_comment": "that's all"
},
"training": {
"model_prob": {
"Target_FTS":1.0,
"MP_traj_v024_alldata_mixu": 1.0,
"_comment": "Note that one train only for one head"
},
"data_dict":{
"Target_FTS": {
"stat_file": "Target_FTS.hdf5",
"training_data": {
"systems": "../../data-30656",
"batch_size": "auto",
"_comment": "that's all"
}
},
"MP_traj_v024_alldata_mixu": {
"stat_file": "MPtraj.hdf5",
"training_data": {
"systems": "../../../opendata/mptraj-v024",
"batch_size": "auto",
"_comment": "that's all"
}
}
},
"numb_steps": 800000,
"warmup_steps": 8000,
"gradient_max_norm": 5.0,
"seed": 19090,
"disp_file": "lcurve.out",
"disp_freq": 1000,
"save_freq": 10000,
"max_ckpt_keep": 20,
"opt_type": "Adam",
"save_ckpt": "model.ckpt",
"_comment": "that's all"
}
}
and this command:
dp --pt train input.json --finetune dpa-2.3.0-m.pt
where dpa-2.3.0-m.pt is a soft link to DPA2_medium_28_10M_beta4.pt, the following error occurs:
[2024-11-07 17:09:28,686] DEEPMD INFO installed to: /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd
[2024-11-07 17:09:28,686] DEEPMD INFO source:
[2024-11-07 17:09:28,686] DEEPMD INFO source brach: HEAD
[2024-11-07 17:09:28,686] DEEPMD INFO source commit: 0abb67b
[2024-11-07 17:09:28,687] DEEPMD INFO source commit at: 2024-09-25 16:48:18 -0400
[2024-11-07 17:09:28,687] DEEPMD INFO use float prec: double
[2024-11-07 17:09:28,687] DEEPMD INFO build variant: cuda
[2024-11-07 17:09:28,687] DEEPMD INFO Backend: PyTorch
[2024-11-07 17:09:28,687] DEEPMD INFO PT ver: v2.1.2.post300-ge32f208075b
[2024-11-07 17:09:28,687] DEEPMD INFO Enable custom OP: True
[2024-11-07 17:09:28,687] DEEPMD INFO build with PT ver: 2.1.2
[2024-11-07 17:09:28,687] DEEPMD INFO build with PT inc: /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/include
[2024-11-07 17:09:28,687] DEEPMD INFO /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/include/torch/csrc/api/include
[2024-11-07 17:09:28,687] DEEPMD INFO build with PT lib: /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/lib
[2024-11-07 17:09:28,687] DEEPMD INFO running on: l12gpu01
[2024-11-07 17:09:28,687] DEEPMD INFO computing device: cuda:0
[2024-11-07 17:09:28,688] DEEPMD INFO CUDA_VISIBLE_DEVICES: 0
[2024-11-07 17:09:28,688] DEEPMD INFO Count of visible GPUs: 1
[2024-11-07 17:09:28,688] DEEPMD INFO num_intra_threads: 0
[2024-11-07 17:09:28,688] DEEPMD INFO num_inter_threads: 0
[2024-11-07 17:09:28,688] DEEPMD INFO ----------------------------------------------------------------------------------------------------------------------------------------
Traceback (most recent call last):
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/bin/dp", line 10, in <module>
sys.exit(main())
^^^^^^
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/main.py", line 929, in main
deepmd_main(args)
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/pt/entrypoints/main.py", line 502, in main
train(FLAGS)
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/pt/entrypoints/main.py", line 291, in train
config = normalize(config, multi_task=multi_task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/utils/argcheck.py", line 2928, in normalize
data = base.normalize_value(data, trim_pattern="_*")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 589, in normalize_value
self.traverse_value(
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 361, in traverse_value
self._traverse_sub(
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 401, in _traverse_sub
subarg.traverse(value, key_hook, value_hook, sub_hook, variant_hook, path)
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 343, in traverse
self.traverse_value(
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 376, in traverse_value
self._traverse_sub(
File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 394, in _traverse_sub
assert isinstance(value, dict)
^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
Is there any solution?
Originally posted by @QuantumMisaka in deepmodeling/deepmd-kit#4322
Metadata
Metadata
Assignees
Labels
No labels