In [None]:
# Now universal MACE finetuning on T2
#!/usr/bin/env python3
import os
import subprocess
import sys

def main():
    """Launch a single-process ``torchrun`` job that fine-tunes MACE with the universal loss.

    Side effects: sets ``PYTORCH_CUDA_ALLOC_CONF`` and puts the local MACE
    path at the front of ``PYTHONPATH`` in this process's environment (the
    child process inherits both), echoes the full command to stderr, then
    runs it and waits for completion.

    Raises:
        subprocess.CalledProcessError: if ``torchrun`` exits with a non-zero
            status (``check=True``).
    """
    # NOTE(review): run_train.py lives under this directory at cli/run_train.py;
    # if `import mace` is meant to resolve to this checkout, PYTHONPATH may need
    # to be the parent directory instead — confirm against the repo layout.
    mace_src = "/home/phanim/harshitrawat/mace/mace"

    # ——— Environment setup ———
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    # Prepend instead of overwrite so entries already on PYTHONPATH survive;
    # dropping an existing copy of mace_src keeps re-runs of this cell idempotent.
    inherited = [
        entry
        for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
        if entry and entry != mace_src
    ]
    os.environ["PYTHONPATH"] = os.pathsep.join([mace_src] + inherited)

    cmd = [
        "torchrun",
        "--standalone",
        "--nnodes=1",
        "--nproc_per_node=1",
        "/home/phanim/harshitrawat/mace/mace/cli/run_train.py",
        "--name",              "mace_it_2_T2_universal_loss_fcut200_avg",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",

        # Training set is the force-filtered (<= 200 eV/Å) copy of T2_it_2.
        "--train_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2_fcut_200.extxyz",
        "--valid_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz",

        "--batch_size",        "2",
        "--valid_batch_size",  "1",

        "--device",            "cuda",

        # === Loss function weights ===
        "--forces_weight",     "40",         # Weight of the force term in the loss
        "--energy_weight",     "100",        # Weight of the energy term in the loss

        # === Learning setup ===
        "--lr",                "0.01",       # Explicit learning rate (author's note: 0.0001 was too low → stagnation)
        "--scheduler_patience", "4",         # Patience of the learning-rate scheduler
        "--clip_grad",         "2",          # Gradient clipping threshold, guards against exploding gradients
        "--weight_decay",      "1e-8",       # Mild regularization to prevent overfitting
        "--loss",              "universal",  # Loss variant used for this run
        # === EMA helps smooth loss curve ===
        "--ema_decay",         "0.999",      # Smooths validation loss and helps final convergence

        # === Domain + training settings ===
        "--r_max",             "5.0",
        "--max_num_epochs",    "130",
        "--E0s",               "average",    # Could optionally be replaced by manual per-element E0s
        "--seed",              "42",
        "--patience",          "8",          # presumably early-stopping patience in epochs — confirm against MACE CLI help

        # Resumes from the latest checkpoint for this --name/--seed if one exists;
        # use a fresh --name after changing hyperparameters to avoid resuming stale weights.
        "--restart_latest",
    ]

    print("Running:", " \\\n    ".join(cmd), file=sys.stderr)
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    main()


Running: torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=1 \
    /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name \
    mace_it_2_T2_universal_loss_fcut200_avg \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2_fcut_200.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz \
    --batch_size \
    2 \
    --valid_batch_size \
    1 \
    --device \
    cuda \
    --forces_weight \
    40 \
    --energy_weight \
    100 \
    --lr \
    0.01 \
    --scheduler_patience \
    4 \
    --clip_grad \
    2 \
    --weight_decay \
    1e-8 \
    --loss \
    universal \
    --ema_decay \
    0.999 \
    --r_max \
    5.0 \
    --max_num_epochs \
    130 \
    --E0s \
    averag

2025-08-18 15:05:04.950 INFO: MACE version: 0.3.14
2025-08-18 15:05:05.518 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-08-18 15:05:05.990 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-08-18 15:05:05.991 INFO: Using heads: ['Default']
2025-08-18 15:05:05.991 INFO: Using the key specifications to parse data:
2025-08-18 15:05:05.991 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-08-18 15:05:11.622 INFO: Training set 1/1 [energy: 2773, stress: 0, virials: 0, dipole components: 0, head: 2773, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 2773, charges: 0]
2025-08-18 15:05:11.629 INFO: Total Training set [energy: 2773, stress: 0, virials: 0, dipole components: 0, head: 2773, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 2773, c



Using reduced CG: False




2025-08-18 15:05:51.564 INFO: Total number of parameters: 894362
2025-08-18 15:05:51.564 INFO: 
2025-08-18 15:05:51.564 INFO: Using ADAM as parameter optimizer
2025-08-18 15:05:51.564 INFO: Batch size: 2
2025-08-18 15:05:51.564 INFO: Number of gradient updates: 180245
2025-08-18 15:05:51.564 INFO: Learning rate: 0.01, weight decay: 1e-08
2025-08-18 15:05:51.564 INFO: UniversalLoss(energy_weight=100.000, forces_weight=40.000, stress_weight=1.000)
2025-08-18 15:05:51.575 INFO: Using gradient clipping with tolerance=2.000
2025-08-18 15:05:51.575 INFO: 
2025-08-18 15:05:51.575 INFO: Started training, reporting errors on validation set
2025-08-18 15:05:51.575 INFO: Loss metrics on validation set


In [14]:
#!/usr/bin/env python3

from ase.io import read, write
import numpy as np
from tqdm import tqdm

# === Config ===
# Source trajectory, destination for the filtered copy, and the threshold on
# the largest per-atom force magnitude above which a whole frame is dropped.
INPUT_FILE = "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2.extxyz"
OUTPUT_FILE = "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2_fcut_200.extxyz"
FORCE_CUTOFF = 200.0  # eV/Å

# === Load and filter ===
print(f"Reading frames from {INPUT_FILE}")
frames = read(INPUT_FILE, index=":")  # ":" loads every frame in the file

filtered_frames = []
for atoms in tqdm(frames, desc="Filtering frames"):
    forces = atoms.arrays["REF_forces"]
    # Norm along axis 1, then the maximum over the frame
    # (presumably forces is (n_atoms, 3) — confirm against the data).
    max_force = np.linalg.norm(forces, axis=1).max()
    # Written as a negated "<=" so a NaN maximum is rejected, same as a too-large one.
    if not (max_force <= FORCE_CUTOFF):
        continue
    filtered_frames.append(atoms)

print(f"Kept {len(filtered_frames)} / {len(frames)} frames with max force ≤ {FORCE_CUTOFF} eV/Å")

# === Write to file ===
write(OUTPUT_FILE, filtered_frames)
print(f"Filtered data written to {OUTPUT_FILE}")


Reading frames from /home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2.extxyz


Filtering frames: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2812/2812 [00:00<00:00, 115926.70it/s]


Kept 2773 / 2812 frames with max force ≤ 200.0 eV/Å
Filtered data written to /home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2_fcut_200.extxyz


In [8]:
# Now universal MACE finetuning on T2
#!/usr/bin/env python3
import os
import subprocess
import sys

def main():
    """Launch a single-process ``torchrun`` job that fine-tunes MACE with manual E0s.

    Side effects: sets ``PYTORCH_CUDA_ALLOC_CONF`` and puts the local MACE
    path at the front of ``PYTHONPATH`` in this process's environment (the
    child process inherits both), echoes a shell-quoted copy of the command
    to stderr, then runs it and waits for completion.

    Raises:
        subprocess.CalledProcessError: if ``torchrun`` exits with a non-zero
            status (``check=True``).
    """
    import shlex  # local import: only needed for the human-readable echo below

    # NOTE(review): run_train.py lives under this directory at cli/run_train.py;
    # if `import mace` is meant to resolve to this checkout, PYTHONPATH may need
    # to be the parent directory instead — confirm against the repo layout.
    mace_src = "/home/phanim/harshitrawat/mace/mace"

    # ——— Environment setup ———
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    # Prepend instead of overwrite so entries already on PYTHONPATH survive;
    # dropping an existing copy of mace_src keeps re-runs of this cell idempotent.
    inherited = [
        entry
        for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
        if entry and entry != mace_src
    ]
    os.environ["PYTHONPATH"] = os.pathsep.join([mace_src] + inherited)

    cmd = [
        "torchrun",
        "--standalone",
        "--nnodes=1",
        "--nproc_per_node=1",
        "/home/phanim/harshitrawat/mace/mace/cli/run_train.py",
        # NOTE(review): the run name says "T3" while the training file below is a
        # T2 dataset — confirm the name is intentional.
        "--name",              "mace_T3_finetune_h200_cn10",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",

        "--train_file", "/home/phanim/harshitrawat/summer/T2_wo_binaries_isolated/T2_wo_bin_iso.extxyz",
        "--valid_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz",

        "--batch_size",        "2",
        "--valid_batch_size",  "1",

        "--device",            "cuda",

        # === Loss function weights ===
        # No --loss flag is passed, so MACE's default loss is used for this run.
        "--forces_weight",     "40",         # Weight of the force term in the loss
        "--energy_weight",     "10",         # Reduced from 100 → avoid dominance + stabilize energy RMSE

        # === Learning setup ===
        "--lr",                "0.001",      # Explicit learning rate (author's note: 0.0001 was too low → stagnation)
        "--scheduler_patience", "4",         # Patience of the learning-rate scheduler
        "--clip_grad",         "2",          # Gradient clipping threshold, guards against exploding gradients
        "--weight_decay",      "1e-8",       # Mild regularization to prevent overfitting

        # === EMA helps smooth loss curve ===
        "--ema_decay",         "0.999",      # Smooths validation loss and helps final convergence

        # === Domain + training settings ===
        "--r_max",             "5.0",
        "--max_num_epochs",    "130",
        # Manual per-element reference energies keyed by atomic number
        # (3 = Li, 8 = O, 40 = Zr, 57 = La); presumably in eV — confirm.
        # NOTE(review): the recorded log for this cell reports "Using isolated atom
        # energies from training file" — verify these values are actually applied.
        "--E0s",               "{3: -1.9089228666666667, 8: -4.947961005 , 40: -8.54770063 , 57: -4.936007105}",
        "--seed",              "42",
        "--patience",          "8",          # presumably early-stopping patience in epochs — confirm against MACE CLI help

        # Resumes from the latest checkpoint for this --name/--seed if one exists.
        # The recorded run resumed an epoch-0 checkpoint and started from a loss of
        # ~4e14; use a fresh --name (or clear ./checkpoints) after changing settings.
        "--restart_latest",
    ]

    # Quote each argument for display only, so the echoed command can be pasted
    # into a shell even though the --E0s value contains spaces and braces.
    print("Running:", " \\\n    ".join(shlex.quote(arg) for arg in cmd), file=sys.stderr)
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    main()


Running: torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=1 \
    /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name \
    mace_T3_finetune_h200_cn10 \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/T2_wo_binaries_isolated/T2_wo_bin_iso.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz \
    --batch_size \
    2 \
    --valid_batch_size \
    1 \
    --device \
    cuda \
    --forces_weight \
    40 \
    --energy_weight \
    10 \
    --lr \
    0.001 \
    --scheduler_patience \
    4 \
    --clip_grad \
    2 \
    --weight_decay \
    1e-8 \
    --ema_decay \
    0.999 \
    --r_max \
    5.0 \
    --max_num_epochs \
    130 \
    --E0s \
    {3: -1.9089228666666667, 8: -4.947961005 

2025-08-18 00:10:19.512 INFO: MACE version: 0.3.14
2025-08-18 00:10:20.061 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-08-18 00:10:20.585 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-08-18 00:10:20.586 INFO: Using heads: ['Default']
2025-08-18 00:10:20.586 INFO: Using the key specifications to parse data:
2025-08-18 00:10:20.586 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-08-18 00:10:23.702 INFO: Using isolated atom energies from training file
2025-08-18 00:10:23.806 INFO: Training set 1/1 [energy: 855, stress: 0, virials: 0, dipole components: 0, head: 855, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 855, charges: 0]
2025-08-18 00:10:23.808 INFO: Total Training set [energy: 855, stress: 0, virials: 0, dipole components: 0,



Using reduced CG: False


  torch.load(f=checkpoint_info.path, map_location=device),


2025-08-18 00:10:49.641 INFO: Total number of parameters: 894362
2025-08-18 00:10:49.641 INFO: 
2025-08-18 00:10:49.641 INFO: Using ADAM as parameter optimizer
2025-08-18 00:10:49.641 INFO: Batch size: 2
2025-08-18 00:10:49.641 INFO: Number of gradient updates: 55575
2025-08-18 00:10:49.641 INFO: Learning rate: 0.001, weight decay: 1e-08
2025-08-18 00:10:49.641 INFO: WeightedEnergyForcesLoss(energy_weight=10.000, forces_weight=40.000)
2025-08-18 00:10:49.651 INFO: Loading checkpoint: ./checkpoints/mace_T3_finetune_h200_cn10_run-42_epoch-0.pt
2025-08-18 00:10:49.685 INFO: Using gradient clipping with tolerance=2.000
2025-08-18 00:10:49.685 INFO: 
2025-08-18 00:10:49.685 INFO: Started training, reporting errors on validation set
2025-08-18 00:10:49.685 INFO: Loss metrics on validation set
2025-08-18 00:13:58.331 INFO: Initial: head: Default, loss=425096977301510.43750000, RMSE_E_per_atom=171198627.24 meV, RMSE_F=840482452.54 meV / A
2025-08-18 00:21:09.725 INFO: Epoch 0: head: Default, l

W0818 00:22:33.849000 355918 site-packages/torch/distributed/elastic/agent/server/api.py:719] Received 2 death signal, shutting down workers
W0818 00:22:33.852000 355918 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 355933 closing signal SIGINT
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m996[0m, in [35m<module>[0m
    [31mmain[0m[1;31m()[0m
    [31m~~~~[0m[1;31m^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m77[0m, in [35mmain[0m
    [31mrun[0m[1;31m(args)[0m
    [31m~~~[0m[1;31m^^^^^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m767[0m, in [35mrun[0m
    [31mtools.train[0m[1;31m([0m
    [31m~~~~~~~~~~~[0m[1;31m^[0m
        [1;31mmodel=model,[0m
        [1;31m^^^^^^^^^^^^[0m
    ...<23 lines>...
        [1;31mrank=rank,[0m
        [1;31m^^^^^^^^^^[0m
    [1

KeyboardInterrupt: 

In [5]:
# Now universal MACE finetuning on T2
#!/usr/bin/env python3
import os
import subprocess
import sys

def main():
    """Launch a single-process ``torchrun`` job that fine-tunes MACE on the unfiltered T2 set.

    Side effects: sets ``PYTORCH_CUDA_ALLOC_CONF`` and puts the local MACE
    path at the front of ``PYTHONPATH`` in this process's environment (the
    child process inherits both), echoes the full command to stderr, then
    runs it and waits for completion.

    Raises:
        subprocess.CalledProcessError: if ``torchrun`` exits with a non-zero
            status (``check=True``).
    """
    # NOTE(review): run_train.py lives under this directory at cli/run_train.py;
    # if `import mace` is meant to resolve to this checkout, PYTHONPATH may need
    # to be the parent directory instead — confirm against the repo layout.
    mace_src = "/home/phanim/harshitrawat/mace/mace"

    # ——— Environment setup ———
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    # Prepend instead of overwrite so entries already on PYTHONPATH survive;
    # dropping an existing copy of mace_src keeps re-runs of this cell idempotent.
    inherited = [
        entry
        for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
        if entry and entry != mace_src
    ]
    os.environ["PYTHONPATH"] = os.pathsep.join([mace_src] + inherited)

    cmd = [
        "torchrun",
        "--standalone",
        "--nnodes=1",
        "--nproc_per_node=1",
        "/home/phanim/harshitrawat/mace/mace/cli/run_train.py",
        "--name",              "mace_iteration_2_T2",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",

        "--train_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2.extxyz",
        "--valid_file", "/home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz",

        "--batch_size",        "2",
        "--valid_batch_size",  "1",
        # NOTE(review): an explicit --valid_file is also given above; presumably the
        # fraction is then unused — confirm against the MACE CLI documentation.
        "--valid_fraction",    "0.1",

        "--device",            "cuda",

        # === Loss function weights ===
        # No --loss flag is passed, so MACE's default loss is used for this run.
        "--forces_weight",     "40",         # Weight of the force term in the loss
        "--energy_weight",     "100",        # Weight of the energy term in the loss

        # === Learning setup ===
        "--lr",                "0.005",      # Explicit learning rate (author's note: 0.0001 was too low → stagnation)
        "--scheduler_patience", "4",         # Patience of the learning-rate scheduler
        "--clip_grad",         "10",         # Gradient clipping threshold, guards against exploding gradients
        "--weight_decay",      "1e-8",       # Mild regularization to prevent overfitting

        # === EMA helps smooth loss curve ===
        "--ema_decay",         "0.999",      # Smooths validation loss and helps final convergence

        # === Domain + training settings ===
        "--r_max",             "5.0",
        "--max_num_epochs",    "130",
        "--E0s",               "average",    # Could optionally be replaced by manual per-element E0s
        "--seed",              "42",
        "--patience",          "8",          # presumably early-stopping patience in epochs — confirm against MACE CLI help

        # Resumes from the latest checkpoint for this --name/--seed if one exists;
        # use a fresh --name after changing hyperparameters to avoid resuming stale weights.
        "--restart_latest",
    ]

    print("Running:", " \\\n    ".join(cmd), file=sys.stderr)
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    main()


Running: torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=1 \
    /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name \
    mace_iteration_2_T2 \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/T2_it_2.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/T1_T2_T3_data/val_it2.extxyz \
    --batch_size \
    2 \
    --valid_batch_size \
    1 \
    --valid_fraction \
    0.1 \
    --device \
    cuda \
    --forces_weight \
    40 \
    --energy_weight \
    100 \
    --lr \
    0.005 \
    --scheduler_patience \
    4 \
    --clip_grad \
    10 \
    --weight_decay \
    1e-8 \
    --ema_decay \
    0.999 \
    --r_max \
    5.0 \
    --max_num_epochs \
    130 \
    --E0s \
    average \
    --seed \
    42

2025-08-15 23:56:00.517 INFO: MACE version: 0.3.14
2025-08-15 23:56:01.061 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-08-15 23:56:01.677 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-08-15 23:56:01.679 INFO: Using heads: ['Default']
2025-08-15 23:56:01.679 INFO: Using the key specifications to parse data:
2025-08-15 23:56:01.679 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-08-15 23:56:07.316 INFO: Training set 1/1 [energy: 2812, stress: 0, virials: 0, dipole components: 0, head: 2812, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 2812, charges: 0]
2025-08-15 23:56:07.324 INFO: Total Training set [energy: 2812, stress: 0, virials: 0, dipole components: 0, head: 2812, elec_temp: 0, total_charge: 0, total_spin: 0, forces: 2812, c



Using reduced CG: False


  torch.load(f=checkpoint_info.path, map_location=device),


2025-08-15 23:56:45.620 INFO: Total number of parameters: 894362
2025-08-15 23:56:45.620 INFO: 
2025-08-15 23:56:45.620 INFO: Using ADAM as parameter optimizer
2025-08-15 23:56:45.620 INFO: Batch size: 2
2025-08-15 23:56:45.620 INFO: Number of gradient updates: 182780
2025-08-15 23:56:45.620 INFO: Learning rate: 0.005, weight decay: 1e-08
2025-08-15 23:56:45.620 INFO: WeightedEnergyForcesLoss(energy_weight=100.000, forces_weight=40.000)
2025-08-15 23:56:45.653 INFO: Loading checkpoint: ./checkpoints/mace_iteration_2_T2_run-42_epoch-21.pt
2025-08-15 23:56:45.687 INFO: Using gradient clipping with tolerance=10.000
2025-08-15 23:56:45.688 INFO: 
2025-08-15 23:56:45.688 INFO: Started training, reporting errors on validation set
2025-08-15 23:56:45.688 INFO: Loss metrics on validation set
2025-08-15 23:59:55.084 INFO: Initial: head: Default, loss=5100.22512230, RMSE_E_per_atom=  540.85 meV, RMSE_F= 2268.14 meV / A
2025-08-16 00:11:02.802 INFO: Epoch 21: head: Default, loss=5386.33421397, RM

W0816 00:15:59.898000 952576 site-packages/torch/distributed/elastic/agent/server/api.py:719] Received 2 death signal, shutting down workers
W0816 00:15:59.899000 952576 site-packages/torch/distributed/elastic/multiprocessing/api.py:900] Sending process 952732 closing signal SIGINT
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m996[0m, in [35m<module>[0m
    [31mmain[0m[1;31m()[0m
    [31m~~~~[0m[1;31m^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m77[0m, in [35mmain[0m
    [31mrun[0m[1;31m(args)[0m
    [31m~~~[0m[1;31m^^^^^^[0m
  File [35m"/home/phanim/harshitrawat/mace/mace/cli/run_train.py"[0m, line [35m767[0m, in [35mrun[0m
    [31mtools.train[0m[1;31m([0m
    [31m~~~~~~~~~~~[0m[1;31m^[0m
        [1;31mmodel=model,[0m
        [1;31m^^^^^^^^^^^^[0m
    ...<23 lines>...
        [1;31mrank=rank,[0m
        [1;31m^^^^^^^^^^[0m
    [1

KeyboardInterrupt: 