Skip to content

_traverse_sub doesn't provide a clear error message #80

@njzjz

Description

@njzjz

When running with this input.json:

{
    "_comment": "multi-task finetuning for DPA-2.3.0-medium",
    "model": {
        "shared_dict": {
            "type_map_all": [
                "H",
                "He",
                "Li",
                "Be",
                "B",
                "C",
                "N",
                "O",
                "F",
                "Ne",
                "Na",
                "Mg",
                "Al",
                "Si",
                "P",
                "S",
                "Cl",
                "Ar",
                "K",
                "Ca",
                "Sc",
                "Ti",
                "V",
                "Cr",
                "Mn",
                "Fe",
                "Co",
                "Ni",
                "Cu",
                "Zn",
                "Ga",
                "Ge",
                "As",
                "Se",
                "Br",
                "Kr",
                "Rb",
                "Sr",
                "Y",
                "Zr",
                "Nb",
                "Mo",
                "Tc",
                "Ru",
                "Rh",
                "Pd",
                "Ag",
                "Cd",
                "In",
                "Sn",
                "Sb",
                "Te",
                "I",
                "Xe",
                "Cs",
                "Ba",
                "La",
                "Ce",
                "Pr",
                "Nd",
                "Pm",
                "Sm",
                "Eu",
                "Gd",
                "Tb",
                "Dy",
                "Ho",
                "Er",
                "Tm",
                "Yb",
                "Lu",
                "Hf",
                "Ta",
                "W",
                "Re",
                "Os",
                "Ir",
                "Pt",
                "Au",
                "Hg",
                "Tl",
                "Pb",
                "Bi",
                "Po",
                "At",
                "Rn",
                "Fr",
                "Ra",
                "Ac",
                "Th",
                "Pa",
                "U",
                "Np",
                "Pu",
                "Am",
                "Cm",
                "Bk",
                "Cf",
                "Es",
                "Fm",
                "Md",
                "No",
                "Lr",
                "Rf",
                "Db",
                "Sg",
                "Bh",
                "Hs",
                "Mt",
                "Ds",
                "Rg",
                "Cn",
                "Nh",
                "Fl",
                "Mc",
                "Lv",
                "Ts",
                "Og"
            ],
            "dpa2_descriptor": {
                "type": "dpa2",
                "repinit": {
                    "tebd_dim": 8,
                    "rcut": 6.0,
                    "rcut_smth": 0.5,
                    "nsel": 120,
                    "neuron": [
                        25,
                        50,
                        100
                    ],
                    "axis_neuron": 12,
                    "activation_function": "tanh",
                    "three_body_sel": 40,
                    "three_body_rcut": 4.0,
                    "three_body_rcut_smth": 3.5,
                    "use_three_body": true
                },
                "repformer": {
                    "rcut": 4.0,
                    "rcut_smth": 3.5,
                    "nsel": 40,
                    "nlayers": 6,
                    "g1_dim": 128,
                    "g2_dim": 32,
                    "attn2_hidden": 32,
                    "attn2_nhead": 4,
                    "attn1_hidden": 128,
                    "attn1_nhead": 4,
                    "axis_neuron": 4,
                    "update_h2": false,
                    "update_g1_has_conv": true,
                    "update_g1_has_grrg": true,
                    "update_g1_has_drrd": true,
                    "update_g1_has_attn": false,
                    "update_g2_has_g1g1": false,
                    "update_g2_has_attn": true,
                    "update_style": "res_residual",
                    "update_residual": 0.01,
                    "update_residual_init": "norm",
                    "attn2_has_gate": true,
                    "use_sqrt_nnei": true,
                    "g1_out_conv": true,
                    "g1_out_mlp": true
                },
                "add_tebd_to_repinit_out": false
            },
            "_comment": "that's all"
        },
        "model_dict": {
            "Target_FTS": {
                "finetune_head": "MP_traj_v024_alldata_mixu",
                "type_map": "type_map_all",
                "descriptor": "dpa2_descriptor",
                "fitting_net": {
                    "neuron": [
                        240,
                        240,
                        240
                    ],
                    "activation_function": "tanh",
                    "resnet_dt": true,
                    "seed": 19090,
                    "_comment": " that's all"
                }
            },
            "MP_traj_v024_alldata_mixu": {
                "finetune_head": "MP_traj_v024_alldata_mixu",
                "type_map": "type_map_all",
                "descriptor": "dpa2_descriptor",
                "fitting_net": {
                    "neuron": [
                        240,
                        240,
                        240
                    ],
                    "activation_function": "tanh",
                    "resnet_dt": true,
                    "seed": 19090,
                    "_comment": " that's all"
                }
            }
        }
    },
    "learning_rate": {
        "type": "exp",
        "decay_steps": 2000,
        "start_lr": 0.001,
        "stop_lr": 3.51e-8,
        "decay_rate": 0.98,
        "_comment": "that's all"
    },
    "loss_dict": {
        "Target_FTS": {
            "type": "ener",
            "start_pref_e": 0.02,
            "limit_pref_e": 1,
            "start_pref_f": 1000,
            "limit_pref_f": 1,
            "start_pref_v": 0,
            "limit_pref_v": 0
        },
        "MP_traj_v024_alldata_mixu": {
            "type": "ener",
            "start_pref_e": 0.02,
            "limit_pref_e": 1,
            "start_pref_f": 1000,
            "limit_pref_f": 1,
            "start_pref_v": 0.02,
            "limit_pref_v": 1
        },
        "_comment": "that's all"
    },
    "training": {
        "model_prob": {
            "Target_FTS":1.0,
            "MP_traj_v024_alldata_mixu": 1.0,
            "_comment": "Note that one train only for one head"
        },
        "data_dict":{
            "Target_FTS": {
                "stat_file": "Target_FTS.hdf5",
                "training_data": {
                    "systems": "../../data-30656",
                    "batch_size": "auto",
                    "_comment": "that's all"
                }
            },
            "MP_traj_v024_alldata_mixu": {
                "stat_file": "MPtraj.hdf5",
                "training_data": {
                    "systems": "../../../opendata/mptraj-v024",
                    "batch_size": "auto",
                    "_comment": "that's all"
                }
            }
        },
        "numb_steps": 800000,
        "warmup_steps": 8000,
        "gradient_max_norm": 5.0,
        "seed": 19090,
        "disp_file": "lcurve.out",
        "disp_freq": 1000,
        "save_freq": 10000,
        "max_ckpt_keep": 20,
        "opt_type": "Adam",
        "save_ckpt": "model.ckpt",
        "_comment": "that's all"
    }
}

and this command:

dp --pt train input.json --finetune dpa-2.3.0-m.pt

Here, dpa-2.3.0-m.pt is a soft link to DPA2_medium_28_10M_beta4.pt. The following error occurs:

[2024-11-07 17:09:28,686] DEEPMD INFO    installed to:          /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd
[2024-11-07 17:09:28,686] DEEPMD INFO    source:                
[2024-11-07 17:09:28,686] DEEPMD INFO    source brach:          HEAD
[2024-11-07 17:09:28,686] DEEPMD INFO    source commit:         0abb67b
[2024-11-07 17:09:28,687] DEEPMD INFO    source commit at:      2024-09-25 16:48:18 -0400
[2024-11-07 17:09:28,687] DEEPMD INFO    use float prec:        double
[2024-11-07 17:09:28,687] DEEPMD INFO    build variant:         cuda
[2024-11-07 17:09:28,687] DEEPMD INFO    Backend:               PyTorch
[2024-11-07 17:09:28,687] DEEPMD INFO    PT ver:                v2.1.2.post300-ge32f208075b
[2024-11-07 17:09:28,687] DEEPMD INFO    Enable custom OP:      True
[2024-11-07 17:09:28,687] DEEPMD INFO    build with PT ver:     2.1.2
[2024-11-07 17:09:28,687] DEEPMD INFO    build with PT inc:     /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/include
[2024-11-07 17:09:28,687] DEEPMD INFO                           /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/include/torch/csrc/api/include
[2024-11-07 17:09:28,687] DEEPMD INFO    build with PT lib:     /lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/lib
[2024-11-07 17:09:28,687] DEEPMD INFO    running on:            l12gpu01
[2024-11-07 17:09:28,687] DEEPMD INFO    computing device:      cuda:0
[2024-11-07 17:09:28,688] DEEPMD INFO    CUDA_VISIBLE_DEVICES:  0
[2024-11-07 17:09:28,688] DEEPMD INFO    Count of visible GPUs: 1
[2024-11-07 17:09:28,688] DEEPMD INFO    num_intra_threads:     0
[2024-11-07 17:09:28,688] DEEPMD INFO    num_inter_threads:     0
[2024-11-07 17:09:28,688] DEEPMD INFO    ----------------------------------------------------------------------------------------------------------------------------------------
Traceback (most recent call last):
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/bin/dp", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/main.py", line 929, in main
    deepmd_main(args)
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/pt/entrypoints/main.py", line 502, in main
    train(FLAGS)
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/pt/entrypoints/main.py", line 291, in train
    config = normalize(config, multi_task=multi_task)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/deepmd/utils/argcheck.py", line 2928, in normalize
    data = base.normalize_value(data, trim_pattern="_*")
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 589, in normalize_value
    self.traverse_value(
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 361, in traverse_value
    self._traverse_sub(
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 401, in _traverse_sub
    subarg.traverse(value, key_hook, value_hook, sub_hook, variant_hook, path)
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 343, in traverse
    self.traverse_value(
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 376, in traverse_value
    self._traverse_sub(
  File "/lustre/home/2201110432/apps/deepmd-kit/3.0.0b4/lib/python3.11/site-packages/dargs/dargs.py", line 394, in _traverse_sub
    assert isinstance(value, dict)
           ^^^^^^^^^^^^^^^^^^^^^^^
AssertionError

Is there any solution?

Originally posted by @QuantumMisaka in deepmodeling/deepmd-kit#4322

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions