In [1]:
import os
import json
import numpy as np
import pandas as pd
from ase.io import read, write
from mace.calculators import MACECalculator
from chgnet.model.model import CHGNet
from chgnet.model.dynamics import CHGNetCalculator
import torch


  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))


In [None]:
import os
import json
from ase.io import read
from chgnet.model.model import CHGNet
from chgnet.model.dynamics import CHGNetCalculator
import torch

# === Setup ===
folders = {
    "/home/phanim/harshitrawat/summer/md/mdcifs": "/home/phanim/harshitrawat/summer/final_work/mdinfo_chgnet_predictions_forces.json",
    "/home/phanim/harshitrawat/summer/md/mdcifs_strained_perturbed": "/home/phanim/harshitrawat/summer/final_work/strain_perturb_chgnet_predictions_forces.json"
}

device = torch.device("cuda:0")  # or whichever MIG slice is active

# === Load CHGNet ===
model = CHGNet.load(use_device="cpu", verbose=True)
model = model.to(device)
calc = CHGNetCalculator(model=model, use_device=device)

def extract_info_from_cif(cif_path):
    try:
        atoms = read(cif_path)
        atoms.calc = calc
        return {
            "file": os.path.basename(cif_path),
            "energy_eV": atoms.get_potential_energy(),
            "forces_per_atom_eV_per_A": atoms.get_forces().tolist(),
            "stress_tensor": atoms.get_stress(voigt=False).tolist(),
            "magmom_total": atoms.get_magnetic_moment() if "magmom" in atoms.arrays else None
        }
    except Exception as e:
        return {
            "file": os.path.basename(cif_path),
            "error": str(e)
        }

# === Label and Save ===
for folder, out_json in folders.items():
    print(f"\n📂 Labeling: {folder}")
    results = []
    cif_files = sorted([f for f in os.listdir(folder) if f.endswith(".cif")])

    for i, fname in enumerate(cif_files):
        full_path = os.path.join(folder, fname)
        result = extract_info_from_cif(full_path)
        results.append(result)
        if "error" in result:
            print(f"❌ {fname} — {result['error']}")
        else:
            print(f"✅ {i+1}/{len(cif_files)} — {fname}")

    os.makedirs(os.path.dirname(out_json), exist_ok=True)
    with open(out_json, "w") as f:
        json.dump(results, f, indent=2)

    print(f"🧾 Saved {len(results)} entries to: {out_json}")


  state = torch.load(path, map_location=torch.device("cpu"))


CHGNet v0.3.0 initialized with 412,525 parameters
CHGNet will run on cpu
CHGNet will run on cuda:0

📂 Labeling: /home/phanim/harshitrawat/summer/md/mdcifs
✅ 1/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0000.cif
✅ 2/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0001.cif
✅ 3/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0002.cif
✅ 4/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0003.cif
✅ 5/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0004.cif
✅ 6/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0005.cif
✅ 7/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0006.cif
✅ 8/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0007.cif
✅ 9/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0008.cif
✅ 10/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100_slab_heavy_T300_0009.cif
✅ 11/6030 — cellrelaxed_LLZO_001_Zr_code93_sto__Li_100

In [5]:
import os, json, numpy as np
import pandas as pd
from ase.io import read, write
from sklearn.model_selection import train_test_split

# === Paths ===
json_paths = [
    "/home/phanim/harshitrawat/summer/final_work/mdinfo_chgnet_predictions_forces.json",
    "/home/phanim/harshitrawat/summer/final_work/strain_perturb_chgnet_predictions_forces.json"
]
base_cif_dir = "/home/phanim/harshitrawat/summer/md/mdcifs"
pert_cif_dir = "/home/phanim/harshitrawat/summer/md/mdcifs_strained_perturbed"
out_folder = "/home/phanim/harshitrawat/summer/final_work"

# === Load JSON and match with CIFs ===
entries = []
for path in json_paths:
    with open(path) as f:
        entries.extend(json.load(f))

entries = [e for e in entries if "error" not in e]

# === Split into T1 and T2 ===
train_entries, val_entries = train_test_split(entries, test_size=0.1, random_state=42)

def make_extxyz(entries, outfile):
    print("extxyz process started")
    atoms_list = []
    for entry in entries:
        fname = entry["file"]
        cif_path = os.path.join(pert_cif_dir if "perturbed" in fname else base_cif_dir, fname)

        try:
            atoms = read(cif_path)
            atoms.info["REF_energy"] = entry["energy_eV"]
            atoms.arrays["REF_forces"] = np.array(entry["forces_per_atom_eV_per_A"])
            atoms.info["file"] = fname
            atoms_list.append(atoms)
        except Exception as e:
            print(f"❌ Failed on {fname}: {e}")

    write(outfile, atoms_list, format="extxyz", write_info=True)
    print(f"✅ Wrote {len(atoms_list)} to: {outfile}")



In [6]:

# === Write EXTXYZs ===
make_extxyz(train_entries, os.path.join(out_folder, "T1_chgnet_labeled.extxyz"))
make_extxyz(val_entries, os.path.join(out_folder, "T2_chgnet_labeled.extxyz"))



extxyz process started
✅ Wrote 6337 to: /home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz
extxyz process started
✅ Wrote 705 to: /home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz


In [7]:
# === Save splits as Excel ===
pd.DataFrame(train_entries).to_excel(os.path.join(out_folder, "T1_split.xlsx"), index=False)
pd.DataFrame(val_entries).to_excel(os.path.join(out_folder, "T2_split.xlsx"), index=False)

In [32]:
!bash /home/phanim/harshitrawat/summer/run_mace.sh


W0723 18:58:00.289000 794575 site-packages/torch/distributed/run.py:766] 
W0723 18:58:00.289000 794575 site-packages/torch/distributed/run.py:766] *****************************************
W0723 18:58:00.289000 794575 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0723 18:58:00.289000 794575 site-packages/torch/distributed/run.py:766] *****************************************
/home/phanim/harshitrawat/miniconda3/bin/python: can't open file '/home/phanim/harshitrawat/summer/python': [Errno 2] No such file or directory
/home/phanim/harshitrawat/miniconda3/bin/python: can't open file '/home/phanim/harshitrawat/summer/python': [Errno 2] No such file or directory
E0723 18:58:00.631000 794575 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 2) local_r

In [17]:
from mace.tools import train
from mace.modules import models
from mace.data.utils import load_dataset
from mace.tools.utils import get_default_dtype
import torch
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.set_default_dtype(torch.float32)  # force everything to float32

# === Path setup ===
train_file = "/home/phanim/harshitrawat/summer/final_work/T1_float32_fixed.extxyz"
test_file  = "/home/phanim/harshitrawat/summer/final_work/T2_float32_fixed.extxyz"
model_path = "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-64-L2_epoch-199.model"

# === Load dataset manually and cast
train_data = load_dataset(train_file, "float32")
test_data  = load_dataset(test_file,  "float32")

# === Load model
model = torch.load(model_path, map_location="cpu")
model = model.to("cuda").float()  # ✅ make model float32

# === Prepare training args
args = dict(
    model=model,
    train_dataset=train_data,
    val_dataset=test_data,
    forces_weight=100.0,
    energy_weight=1.0,
    loss="weighted",
    learning_rate=0.001,
    ema_decay=0.99,
    num_epochs=300,
    batch_size=4,
    device="cuda",
)

# === Launch training
train.train(**args)


  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))


ImportError: cannot import name 'load_dataset' from 'mace.data.utils' (/home/phanim/harshitrawat/summer/mace/mace/data/utils.py)

In [11]:
export PYTHONPATH=/home/phanim/harshitrawat/summer/mace

torchrun --nproc_per_node=2 \
         --nnodes=1 \
         --rdzv_backend=c10d \
         --rdzv_endpoint=localhost:0 \
         --master_port=29501 \
         -m mace.commands.train \
         --distributed \
         --launcher torchrun \
         --name mace_T1_finetune \
         --model MACE \
         --train_file /home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz \
         --test_file  /home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz \
         --foundation_model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
         --foundation_model_readout \
         --device cuda \
         --batch_size 2 \
         --valid_batch_size 1 \
         --default_dtype float64 \
         --valid_fraction 0.005 \
         --max_num_epochs 5 \
         --forces_weight 100.0 \
         --energy_weight 1.0 \
         --r_max 5.0 \
         --E0s "{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}"


  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
ERROR:root:Failed to initialize distributed environment: 'SLURM_NTASKS_PER_NODE'


In [4]:
!PYTHONPATH=/home/phanim/harshitrawat/mace/mace \
torchrun \
  --standalone \
  --nnodes=1 \
  --nproc_per_node=2 \
  /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
    --name='mace_T1_finetune' \
    --model='MACE' \
    --num_interactions=2 \
    --num_channels=128 \
    --max_L=2 \
    --correlation=3 \
    --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
    --r_max=5.0 \
    --train_file='/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.h5' \
    --valid_file='/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.h5' \
    --statistics_file='/home/phanim/harshitrawat/summer/final_work/statistics.json' \
    --num_workers=4 \
    --batch_size=2 \
    --valid_batch_size=1 \
    --max_num_epochs=5 \
    --loss='weighted' \
    --error_table='PerAtomRMSE' \
    --default_dtype='float64' \
    --device='cuda' \
    --distributed \
    --seed=42


W0723 23:39:42.779000 1107440 site-packages/torch/distributed/run.py:766] 
W0723 23:39:42.779000 1107440 site-packages/torch/distributed/run.py:766] *****************************************
W0723 23:39:42.779000 1107440 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0723 23:39:42.779000 1107440 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
[rank0]: Traceback (most recent call last):
[rank0]:   File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-packages/torch/cuda/__init__.py"[0m, line [35m383[0m, in [35m_lazy_init[0m
[ran

In [16]:
!export CUDA_VISIBLE_DEVICES=1,3

PYTHONPATH='/home/phanim/harshitrawat/mace/mace' \
torchrun --standalone \
         --nnodes=1 \
         --nproc_per_node=2 \
         /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --device="cuda" \
  --batch_size=2 \
  --valid_batch_size=1 \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42


SyntaxError: invalid syntax (1751150545.py, line 4)

In [22]:
import os

# Expose both MIG GPUs (1 and 3) to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

!torchrun --standalone \
         --nnodes=1 \
         --nproc_per_node=2 \
         /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --device="cuda" \
  --batch_size=2 \
  --valid_batch_size=1 \
  --valid_fraction=0.005 \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42


W0724 00:21:12.310000 1272055 site-packages/torch/distributed/run.py:766] 
W0724 00:21:12.310000 1272055 site-packages/torch/distributed/run.py:766] *****************************************
W0724 00:21:12.310000 1272055 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 00:21:12.310000 1272055 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 00:21:17.805 INFO: MACE version: 0.3.14
2025-07-24 00:21:17.805 INFO: MACE version: 0.3.14
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-

In [17]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,3"
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

!torchrun --standalone \
         --nnodes=1 \
         --nproc_per_node=2 \
         /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --device="cuda" \
  --batch_size=2 \
  --valid_batch_size=1 \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42


W0724 00:08:07.618000 1142400 site-packages/torch/distributed/run.py:766] 
W0724 00:08:07.618000 1142400 site-packages/torch/distributed/run.py:766] *****************************************
W0724 00:08:07.618000 1142400 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 00:08:07.618000 1142400 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 00:08:14.213 INFO: MACE version: 0.3.14
2025-07-24 00:08:14.233 INFO: MACE version: 0.3.14
Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-

In [23]:
import os

# Only expose ONE MIG device to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# PYTHONPATH optional if already in sys.path
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

!torchrun --standalone \
         --nnodes=1 \
         --nproc_per_node=1 \
         /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --device="cuda" \
  --batch_size=2 \
  --valid_batch_size=1 \
  --valid_fraction 0.005 \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42


  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 00:22:22.775 INFO: MACE version: 0.3.14
2025-07-24 00:22:23.300 INFO: CUDA version: 12.6, CUDA device: 0
  model_foundation = torch.load(
2025-07-24 00:22:23.805 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-07-24 00:22:23.806 INFO: Using heads: ['Default']
2025-07-24 00:22:23.806 INFO: Using the key specifications to parse data:
2025-07-24 00:22:23.806 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head', 'elec_temp': 'elec_temp', 'total_charge': 'total_charge', 'total_spin': 'total_spin'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-07-24 00:22:44.801 INFO: Training set 1/1 [energy: 6337, stress: 0, virials: 0, dipole components: 0, head: 6337, elec

In [2]:
import os

# Only expose ONE MIG device to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# PYTHONPATH optional if already in sys.path
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"
!torchrun --standalone \
  --nnodes=1 \
  --nproc_per_node=2 \
  /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --batch_size=1 \
  --valid_batch_size=1 \
  --valid_fraction=0.005 \
  --device="cuda" \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42 \
  --distributed


W0724 00:40:27.565000 1353925 site-packages/torch/distributed/run.py:766] 
W0724 00:40:27.565000 1353925 site-packages/torch/distributed/run.py:766] *****************************************
W0724 00:40:27.565000 1353925 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 00:40:27.565000 1353925 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 00:40:33.637 INFO: Process group initialized: True
2025-07-24 00:40:33.637 INFO: Processes: 1
2025-07-24 00:40:33.637 INFO: MACE version: 0.3.14
2025-07-24 00:40:33.637 INFO: CUDA version: 12.6, C

In [4]:
import os

# Only expose ONE MIG device to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# PYTHONPATH optional if already in sys.path
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"
!torchrun --standalone \
  --nnodes=1 \
  --nproc_per_node=2 \
  /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --num_interactions=1 \
  --hidden_irreps="64x0e+64x1o+64x2e" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --batch_size=1 \
  --valid_batch_size=1 \
  --valid_fraction=0.005 \
  --device="cuda" \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42 \
  --distributed


W0724 00:48:35.478000 1362799 site-packages/torch/distributed/run.py:766] 
W0724 00:48:35.478000 1362799 site-packages/torch/distributed/run.py:766] *****************************************
W0724 00:48:35.478000 1362799 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 00:48:35.478000 1362799 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 00:48:39.923 INFO: Process group initialized: True
2025-07-24 00:48:39.924 INFO: Processes: 1
2025-07-24 00:48:39.924 INFO: MACE version: 0.3.14
2025-07-24 00:48:39.924 INFO: CUDA version: 12.6, C

In [9]:
import os

# Only expose ONE MIG device to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-98d798d2-0f1a-500f-acdd-1a3ae2bc68d3, MIG-f3bd2aad-c585-52fe-b36a-42380cc6cfc6"  # or "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# PYTHONPATH optional if already in sys.path
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"
!torchrun --standalone \
  --nnodes=1 \
  --nproc_per_node=2 \
  /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune" \
  --model="MACE" \
  --num_interactions=1 \
  --hidden_irreps="64x0e+64x1o+64x2e" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --batch_size=4 \
  --valid_batch_size=1 \
  --valid_fraction=0.005 \
  --device="cuda" \
  --max_num_epochs=5 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42 \
  --distributed


W0724 01:32:57.517000 1415701 site-packages/torch/distributed/run.py:766] 
W0724 01:32:57.517000 1415701 site-packages/torch/distributed/run.py:766] *****************************************
W0724 01:32:57.517000 1415701 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 01:32:57.517000 1415701 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 01:33:04.565 INFO: Process group initialized: True
2025-07-24 01:33:04.570 INFO: Processes: 1
2025-07-24 01:33:04.570 INFO: MACE version: 0.3.14
2025-07-24 01:33:04.571 INFO: CUDA version: 12.6, C

In [21]:
import os

# Only expose ONE MIG device to torch
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-98d798d2-0f1a-500f-acdd-1a3ae2bc68d3, MIG-f3bd2aad-c585-52fe-b36a-42380cc6cfc6"  # or "3"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# PYTHONPATH optional if already in sys.path
os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"
!torchrun --standalone \
  --nnodes=1 \
  --nproc_per_node=2 \
  /home/phanim/harshitrawat/mace/mace/cli/run_train.py \
  --name="mace_T1_finetune_a" \
  --model="MACE" \
  --num_interactions=1 \
  --foundation_model="/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model" \
  --foundation_model_readout \
  --hidden_irreps="64x0e+64x1o+64x2e" \
  --train_file="/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz" \
  --test_file="/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz" \
  --batch_size=1 \
  --valid_batch_size=1 \
  --valid_fraction=0.005 \
  --device="cuda" \
  --forces_weight=100.0 \
  --energy_weight=1.0 \
  --max_num_epochs=20 \
  --r_max=5.0 \
  --E0s="{3:-201.7093,8:-431.6112,40:-1275.9529,57:-857.6754}" \
  --seed=42 \
  --distributed


W0724 02:46:55.678000 1500316 site-packages/torch/distributed/run.py:766] 
W0724 02:46:55.678000 1500316 site-packages/torch/distributed/run.py:766] *****************************************
W0724 02:46:55.678000 1500316 site-packages/torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0724 02:46:55.678000 1500316 site-packages/torch/distributed/run.py:766] *****************************************
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))
2025-07-24 02:47:02.405 INFO: Process group initialized: True
2025-07-24 02:47:02.405 INFO: Processes: 1
2025-07-24 02:47:02.405 INFO: MACE version: 0.3.14
2025-07-24 02:47:02.405 INFO: CUDA version: 12.6, C

In [14]:
from ase.io import read
atoms = read("/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz", index=0)
print("Energy (eV):", atoms.info.get("energy"))
print("Forces (eV/Å):", atoms.get_forces())


Energy (eV): None


RuntimeError: Atoms object has no calculator.

In [15]:
from ase.io import read
from ase.io.extxyz import read_extxyz
import traceback

extxyz_path = "/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz"

try:
    atoms_list = list(read(extxyz_path, index=":"))
    print(f"✅ Successfully read {len(atoms_list)} structures")
except Exception as e:
    print("❌ Failed to read with ASE:")
    traceback.print_exc()


✅ Successfully read 6337 structures


In [23]:
import os
import torch
from mace.modules import MACE
from mace.tools.utils import read_from_json
from mace.data.utils import load_dataset
from torch.utils.data import random_split, DataLoader
from mace.training.trainer import Trainer
from mace.training.loss_functions import EnergyForcesLoss
from mace.training.utils import setup_logger

# ==== Config ====
pretrained_model_path = "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model"
train_path = "/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz"
valid_fraction = 0.005
save_dir = "mace_t1_finetuned_large"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 2
max_epochs = 20
r_max = 5.0
seed = 42

# ==== Reproducibility ====
torch.manual_seed(seed)

# ==== Load pretrained model config ====
config = read_from_json(pretrained_model_path + ".config.json")
model = MACE(**config).to(device)
model.load_state_dict(torch.load(pretrained_model_path, map_location=device))

# ==== Load dataset ====
data = load_dataset(train_path, r_max=r_max)
num_valid = max(1, int(valid_fraction * len(data)))
num_train = len(data) - num_valid
train_set, valid_set = random_split(data, [num_train, num_valid])
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=1)

# ==== Define Loss and Optimizer ====
loss_fn = EnergyForcesLoss(forces_weight=100.0, energy_weight=1.0)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# ==== Trainer Setup ====
os.makedirs(save_dir, exist_ok=True)
logger = setup_logger(save_dir)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    save_dir=save_dir,
    max_num_epochs=max_epochs,
    device=device,
    logger=logger
)

# ==== Train ====
trainer.train()

# ==== Save Final Model ====
torch.save(model.state_dict(), os.path.join(save_dir, "final_model.pt"))
print("✅ Training complete. Model saved at:", save_dir)


ImportError: cannot import name 'read_from_json' from 'mace.tools.utils' (/home/phanim/harshitrawat/mace/mace/tools/utils.py)

In [1]:
#!/usr/bin/env python3
import os
import subprocess
import sys
import torch


def main():
    # ——— Environment setup ———
    # expose exactly the two MIG slices you want
    # reduce fragmentation
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    # so that `mace` imports point to your local clone
    os.environ["PYTHONPATH"] = "/home/phanim/harshitrawat/mace/mace"

    # ——— Compose the torchrun command ———
    cmd = [
        "mace_run_train",
        "--name",              "mace_T1_finetune_scripted",
        "--model",             "MACE",
        "--num_interactions",  "2",
        "--foundation_model",  "/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model",
        "--foundation_model_readout",
        "--train_file",        "/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz",
        "--valid_file",        "/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz",
        "--batch_size",        "4",
        "--valid_batch_size",  "1",
        "--valid_fraction",    "0.1",
        "--ema_decay",         "0.99999",
        "--lr",                "0.0001",
        "--num_samples_pt",    "100000",
        "--forces_weight",     "10",
        "--energy_weight",     "1",
        "--device",            "cuda",
        "--loss",              "universal",
        "--max_num_epochs",    "20",
        "--r_max",             "5.0",
        "--enable_cueq",       "True",
        "--restart_latest",  # ✅ add this
        "--E0s",               "average",
        "--seed",              "42",
    ]

    # echo it so you can audit
    print("Running:", " \\\n    ".join(cmd), file=sys.stderr)
    # execute
    subprocess.run(cmd, check=True)

if __name__ == "__main__":
    main()


Running: mace_run_train \
    --name \
    mace_T1_finetune_scripted \
    --model \
    MACE \
    --num_interactions \
    2 \
    --foundation_model \
    /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model \
    --foundation_model_readout \
    --train_file \
    /home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz \
    --valid_file \
    /home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz \
    --batch_size \
    4 \
    --valid_batch_size \
    1 \
    --valid_fraction \
    0.1 \
    --ema_decay \
    0.99999 \
    --lr \
    0.0001 \
    --num_samples_pt \
    100000 \
    --forces_weight \
    10 \
    --energy_weight \
    1 \
    --device \
    cuda \
    --loss \
    universal \
    --max_num_epochs \
    20 \
    --r_max \
    5.0 \
    --enable_cueq \
    True \
    --restart_latest \
    --E0s \
    average \
    --seed \
    42
  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dir

2025-07-25 01:11:50.960 INFO: MACE version: 0.3.13
2025-07-25 01:11:51.622 INFO: CUDA version: 12.6, CUDA device: 0


  model_foundation = torch.load(


2025-07-25 01:11:52.299 INFO: Using foundation model /home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model as initial checkpoint.
2025-07-25 01:11:52.302 INFO: Using heads: ['Default']
2025-07-25 01:11:52.302 INFO: Using the key specifications to parse data:
2025-07-25 01:11:52.302 INFO: Default: KeySpecification(info_keys={'energy': 'REF_energy', 'stress': 'REF_stress', 'virials': 'REF_virials', 'dipole': 'dipole', 'head': 'head'}, arrays_keys={'forces': 'REF_forces', 'charges': 'REF_charges'})
2025-07-25 01:12:13.297 INFO: Training set 1/1 [energy: 6337, stress: 0, virials: 0, dipole components: 0, head: 6337, forces: 6337, charges: 0]
2025-07-25 01:12:13.315 INFO: Total Training set [energy: 6337, stress: 0, virials: 0, dipole components: 0, head: 6337, forces: 6337, charges: 0]
2025-07-25 01:12:15.913 INFO: Validation set 1/1 [energy: 705, stress: 0, virials: 0, dipole components: 0, head: 705, forces: 705, charges: 0]
2025-07-25 01:12:15.915

  "atomic_numbers", torch.tensor(atomic_numbers, dtype=torch.int64)


2025-07-25 01:14:01.606 INFO: Total number of parameters: 894362
2025-07-25 01:14:01.606 INFO: 
2025-07-25 01:14:01.606 INFO: Using ADAM as parameter optimizer
2025-07-25 01:14:01.606 INFO: Batch size: 4
2025-07-25 01:14:01.606 INFO: Number of gradient updates: 31685
2025-07-25 01:14:01.606 INFO: Learning rate: 0.0001, weight decay: 5e-07
2025-07-25 01:14:01.606 INFO: UniversalLoss(energy_weight=1.000, forces_weight=10.000, stress_weight=1.000)
2025-07-25 01:14:01.606 INFO: Converting model to CUEQ for accelerated training
2025-07-25 01:14:01.684 INFO: Creating new model with cuequivariance settings


Traceback (most recent call last):
  File [35m"/home/phanim/harshitrawat/miniconda3/bin/mace_run_train"[0m, line [35m8[0m, in [35m<module>[0m
    sys.exit([31mmain[0m[1;31m()[0m)
             [31m~~~~[0m[1;31m^^[0m
  File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-packages/mace/cli/run_train.py"[0m, line [35m75[0m, in [35mmain[0m
    [31mrun[0m[1;31m(args)[0m
    [31m~~~[0m[1;31m^^^^^^[0m
  File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-packages/mace/cli/run_train.py"[0m, line [35m668[0m, in [35mrun[0m
    model = run_e3nn_to_cueq(deepcopy(model), device=device)
  File [35m"/home/phanim/harshitrawat/miniconda3/lib/python3.13/site-packages/mace/cli/convert_e3nn_cueq.py"[0m, line [35m166[0m, in [35mrun[0m
    [31mtransfer_weights[0m[1;31m(source_model, target_model, max_L, correlation, num_layers)[0m
    [31m~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File

CalledProcessError: Command '['mace_run_train', '--name', 'mace_T1_finetune_scripted', '--model', 'MACE', '--num_interactions', '2', '--foundation_model', '/home/phanim/harshitrawat/summer/mace_models/universal/2024-01-07-mace-128-L2_epoch-199.model', '--foundation_model_readout', '--train_file', '/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz', '--valid_file', '/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz', '--batch_size', '4', '--valid_batch_size', '1', '--valid_fraction', '0.1', '--ema_decay', '0.99999', '--lr', '0.0001', '--num_samples_pt', '100000', '--forces_weight', '10', '--energy_weight', '1', '--device', 'cuda', '--loss', 'universal', '--max_num_epochs', '20', '--r_max', '5.0', '--enable_cueq', 'True', '--restart_latest', '--E0s', 'average', '--seed', '42']' returned non-zero exit status 1.

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))


True
1
NVIDIA H200
