In [2]:
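# Universal MACE fine-tuning, iteration 2 — NOTE: the run name says T2 but --train_file points at T1_it_2_isolated.extxyz; confirm which dataset is intended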
#!/usr/bin/env python3
import os
import subprocess
import sys

def main(*, dry_run: bool = False) -> list:
    """Launch MACE fine-tuning from the universal foundation model via ``torchrun``.

    Sets the allocator/``PYTHONPATH`` environment variables, assembles the
    ``run_train.py`` command line for a single-node, single-process CUDA run,
    echoes it to stderr and then executes it.

    Args:
        dry_run: When true, the environment is still set and the command is
            still printed, but ``torchrun`` is not started.

    Returns:
        The argv list that was (or would have been) executed.

    Raises:
        subprocess.CalledProcessError: If ``torchrun`` exits with a non-zero
            status (``check=True``).
    """
    # ——— Environment setup ———
    # NOTE: both assignments replace any value already present in the
    # environment of this process (and therefore of the child process).
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

    cmd = [
        "torchrun",
        "--standalone",
        "--nnodes=1",
        "--nproc_per_node=1",
        "/home/phanim/harshitrawat/mace/mace/cli/run_train.py",
        "--name",              "mace_iteration_2_T2_iso",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",

        "--train_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T1_it_2_isolated.extxyz",
        "--valid_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz",

        "--batch_size",        "2",
        "--valid_batch_size",  "1",
        # NOTE(review): an explicit --valid_file is also passed above;
        # presumably this fraction is then unused — confirm against the MACE CLI.
        "--valid_fraction",    "0.1",

        "--device",            "cuda",

        # === Loss function weights ===
        "--forces_weight",     "40",
        "--energy_weight",     "100",

        # === Learning setup ===
        "--lr",                "0.005",      # Explicit learning rate
        "--scheduler_patience", "4",         # LR-scheduler patience of 4
        "--clip_grad",         "10",         # Gradient clipping to avoid exploding gradients
        "--weight_decay",      "1e-8",       # Mild regularization

        # EMA is intentionally not configured for this run (no --ema_decay).

        # === Domain + training settings ===
        "--r_max",             "5.0",
        "--max_num_epochs",    "130",
        # Manual per-element reference energies keyed by atomic number
        # (3 = Li, 8 = O, 40 = Zr, 57 = La), passed as one dict-literal string.
        "--E0s",               "{3: -1.9089228666666667, 8: -4.947961005 , 40: -8.54770063 , 57: -4.936007105}",
        "--seed",              "42",
        "--patience",          "8",

        "--restart_latest",                  # Resumes from checkpoint if available
    ]

    # Echo the command, one token per line, in a copy-pasteable shell form.
    print("Running:", " \\\n    ".join(cmd), file=sys.stderr)
    if not dry_run:
        subprocess.run(cmd, check=True)
    return cmd

# Start the training launcher only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Running: torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=1 \
    /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name \
    mace_iteration_2_T1 \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/T1_it_2.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz \
    --batch_size \
    2 \
    --valid_batch_size \
    1 \
    --valid_fraction \
    0.1 \
    --device \
    cuda \
    --forces_weight \
    40 \
    --energy_weight \
    100 \
    --lr \
    0.005 \
    --scheduler_patience \
    4 \
    --clip_grad \
    10 \
    --weight_decay \
    1e-8 \
    --r_max \
    5.0 \
    --max_num_epochs \
    130 \
    --E0s \
    average \
    --seed \
    42 \
    --patience \
    8 \
  

2025-08-16 00:20:43.261 INFO: MACE version: 0.3.14
2025-08-16 00:20:43.930 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-08-16 00:20:44.496 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-08-16 00:20:44.497 INFO: Using heads: ['Default']
2025-08-16 00:20:44.497 INFO: Using the key specifications to parse data:
2025-08-16 00:20:44.497 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-08-16 00:21:05.876 INFO: Training set 1/1 [energy: 7537, stress: 0, virials: 0, dipole components: 0, head: 7537, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 7537, charges: 0]
2025-08-16 00:21:05.900 INFO: Total Training set [energy: 7537, stress: 0, virials: 0, dipole components: 0, head: 7537, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 7537, c



Using reduced CG: False


  torch.load(f=checkpoint_info.path, map_location=device),


2025-08-16 00:23:02.574 INFO: Total number of parameters: 894362
2025-08-16 00:23:02.574 INFO: 
2025-08-16 00:23:02.574 INFO: Using ADAM as parameter optimizer
2025-08-16 00:23:02.574 INFO: Batch size: 2
2025-08-16 00:23:02.574 INFO: Number of gradient updates: 489905
2025-08-16 00:23:02.574 INFO: Learning rate: 0.005, weight decay: 1e-08
2025-08-16 00:23:02.574 INFO: WeightedEnergyForcesLoss(energy_weight=100.000, forces_weight=40.000)
2025-08-16 00:23:02.582 INFO: Loading checkpoint: ./checkpoints/mace_iteration_2_T1_run-42_epoch-2.pt
2025-08-16 00:23:02.620 INFO: Using gradient clipping with tolerance=10.000
2025-08-16 00:23:02.621 INFO: 
2025-08-16 00:23:02.621 INFO: Started training, reporting errors on validation set
2025-08-16 00:23:02.621 INFO: Loss metrics on validation set
2025-08-16 00:26:11.823 INFO: Initial: head: Default, loss=8150.65181095, RMSE_E_per_atom= 1020.30 meV, RMSE_F= 2793.83 meV / A
2025-08-16 01:00:32.513 INFO: Epoch 2: head: Default, loss=9720.40821066, RMSE

W0816 01:38:21.530000 1098674 site-packages/torch/distributed/elastic/agent/server/api.py:719] Received 2 death signal, shutting down workers
W0816 01:38:21.533000 1098674 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 1098703 closing signal SIGINT
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m996[0m, in [35m<module>[0m
    [31mmain[0m[1;31m()[0m
    [31m~~~~[0m[1;31m^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m77[0m, in [35mmain[0m
    [31mrun[0m[1;31m(args)[0m
    [31m~~~[0m[1;31m^^^^^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m767[0m, in [35mrun[0m
    [31mtools.train[0m[1;31m([0m
    [31m~~~~~~~~~~~[0m[1;31m^[0m
        [1;31mmodel=model,[0m
        [1;31m^^^^^^^^^^^^[0m
    ...<23 lines>...
        [1;31mrank=rank,[0m
        [1;31m^^^^^^^^^^[0m
    

KeyboardInterrupt: 

In [5]:
# Universal MACE fine-tuning on T1 (iteration 2)
#!/usr/bin/env python3
import os
import subprocess
import sys

def main(*, dry_run: bool = False) -> list:
    """Launch MACE fine-tuning on the T1 iteration-2 data set via ``torchrun``.

    Sets the allocator/``PYTHONPATH`` environment variables, assembles the
    ``run_train.py`` command line for a single-node, single-process CUDA run
    starting from the universal foundation model, echoes it to stderr and
    then executes it.

    Args:
        dry_run: When true, the environment is still set and the command is
            still printed, but ``torchrun`` is not started.

    Returns:
        The argv list that was (or would have been) executed.

    Raises:
        subprocess.CalledProcessError: If ``torchrun`` exits with a non-zero
            status (``check=True``).
    """
    # ——— Environment setup ———
    # NOTE: both assignments replace any value already present in the
    # environment of this process (and therefore of the child process).
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

    launcher = [
        "torchrun",
        "--standalone",
        "--nnodes=1",
        "--nproc_per_node=1",
        "/home/phanim/harshitrawat/mace/mace/cli/run_train.py",
    ]

    model_args = [
        "--name",              "mace_iteration_2_T1",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",
    ]

    data_args = [
        "--train_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T1_it_2.extxyz",
        "--valid_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz",
        "--batch_size",        "2",
        "--valid_batch_size",  "1",
        # NOTE(review): an explicit --valid_file is also passed above;
        # presumably this fraction is then unused — confirm against the MACE CLI.
        "--valid_fraction",    "0.1",
        "--device",            "cuda",
    ]

    optimisation_args = [
        # === Loss function weights ===
        "--forces_weight",     "40",
        "--energy_weight",     "100",

        # === Learning setup ===
        "--lr",                "0.005",      # Explicit learning rate
        "--scheduler_patience", "4",         # LR-scheduler patience of 4
        "--clip_grad",         "10",         # Gradient clipping to avoid exploding gradients
        "--weight_decay",      "1e-8",       # Mild regularization

        # === EMA decay ===
        # NOTE(review): only the decay is set here; the MACE CLI may also
        # need the separate --ema switch for EMA to be active — confirm.
        "--ema_decay",         "0.999",
    ]

    run_args = [
        # === Domain + training settings ===
        "--r_max",             "5.0",
        "--max_num_epochs",    "130",
        "--E0s",               "average",    # Could optionally be replaced by manual E0s
        "--seed",              "42",
        "--patience",          "8",

        "--restart_latest",                  # Resumes from checkpoint if available
    ]

    cmd = launcher + model_args + data_args + optimisation_args + run_args

    # Echo the command, one token per line, in a copy-pasteable shell form.
    print("Running:", " \\\n    ".join(cmd), file=sys.stderr)
    if not dry_run:
        subprocess.run(cmd, check=True)
    return cmd

# Start the training launcher only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Running: torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=1 \
    /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name \
    mace_iteration_2_T1 \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/T1_it_2_isolated.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz \
    --batch_size \
    2 \
    --valid_batch_size \
    1 \
    --valid_fraction \
    0.1 \
    --device \
    cuda \
    --forces_weight \
    40 \
    --energy_weight \
    100 \
    --lr \
    0.005 \
    --scheduler_patience \
    4 \
    --clip_grad \
    10 \
    --weight_decay \
    1e-8 \
    --ema_decay \
    0.999 \
    --r_max \
    5.0 \
    --max_num_epochs \
    130 \
    --E0s \
    {3: -1.9089228666666

2025-08-15 22:18:01.046 INFO: MACE version: 0.3.14
2025-08-15 22:18:01.918 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-08-15 22:18:02.479 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-08-15 22:18:02.480 INFO: Using heads: ['Default']
2025-08-15 22:18:02.480 INFO: Using the key specifications to parse data:
2025-08-15 22:18:02.480 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-08-15 22:18:23.787 INFO: Training set 1/1 [energy: 7541, stress: 0, virials: 0, dipole components: 0, head: 7541, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 7541, charges: 0]
2025-08-15 22:18:23.810 INFO: Total Training set [energy: 7541, stress: 0, virials: 0, dipole components: 0, head: 7541, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 7541, c



Using reduced CG: False


  torch.load(f=checkpoint_info.path, map_location=device),


2025-08-15 22:20:20.639 INFO: Total number of parameters: 894362
2025-08-15 22:20:20.639 INFO: 
2025-08-15 22:20:20.639 INFO: Using ADAM as parameter optimizer
2025-08-15 22:20:20.639 INFO: Batch size: 2
2025-08-15 22:20:20.639 INFO: Number of gradient updates: 490165
2025-08-15 22:20:20.639 INFO: Learning rate: 0.005, weight decay: 1e-08
2025-08-15 22:20:20.639 INFO: WeightedEnergyForcesLoss(energy_weight=100.000, forces_weight=40.000)
2025-08-15 22:20:20.647 INFO: Loading checkpoint: ./checkpoints/mace_iteration_2_T1_run-42_epoch-2.pt
2025-08-15 22:20:20.686 INFO: Using gradient clipping with tolerance=10.000
2025-08-15 22:20:20.686 INFO: 
2025-08-15 22:20:20.686 INFO: Started training, reporting errors on validation set
2025-08-15 22:20:20.686 INFO: Loss metrics on validation set
2025-08-15 22:23:28.508 INFO: Initial: head: Default, loss=7681.54137824, RMSE_E_per_atom=  582.12 meV, RMSE_F= 2814.56 meV / A
2025-08-15 22:57:49.468 INFO: Epoch 2: head: Default, loss=8150.65181095, RMSE

W0815 23:55:51.168000 776979 site-packages/torch/distributed/elastic/agent/server/api.py:719] Received 2 death signal, shutting down workers
W0815 23:55:51.169000 776979 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 776988 closing signal SIGINT
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m996[0m, in [35m<module>[0m
    [31mmain[0m[1;31m()[0m
    [31m~~~~[0m[1;31m^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m77[0m, in [35mmain[0m
    [31mrun[0m[1;31m(args)[0m
    [31m~~~[0m[1;31m^^^^^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m767[0m, in [35mrun[0m
    [31mtools.train[0m[1;31m([0m
    [31m~~~~~~~~~~~[0m[1;31m^[0m
        [1;31mmodel=model,[0m
        [1;31m^^^^^^^^^^^^[0m
    ...<23 lines>...
        [1;31mrank=rank,[0m
        [1;31m^^^^^^^^^^[0m
    [1

KeyboardInterrupt: 