In [4]:
from ase.io import read
from chgnet.graph.converter import CrystalGraphConverter

# === Load .extxyz files ===
train_extxyz = "/home/phanim/harshitrawat/summer/final_work/T1_chgnet_labeled.extxyz"
valid_extxyz = "/home/phanim/harshitrawat/summer/final_work/T2_chgnet_labeled.extxyz"

train_atoms = read(train_extxyz, index=":")
valid_atoms = read(valid_extxyz, index=":")

print(f"✅ Loaded {len(train_atoms)} training structures")
print(f"✅ Loaded {len(valid_atoms)} validation structures")

# === Initialize Converter ===
converter = CrystalGraphConverter(
    atom_graph_cutoff=6.0,
    bond_graph_cutoff=3.0,
    algorithm="fast",
    on_isolated_atoms="warn",
    verbose=True
)

✅ Loaded 6337 training structures
✅ Loaded 705 validation structures
CrystalGraphConverter(algorithm='fast', atom_graph_cutoff=6.0, bond_graph_cutoff=3.0)


AttributeError: 'CrystalGraphConverter' object has no attribute 'convert'

In [7]:
from joblib import Parallel, delayed
from pymatgen.io.ase import AseAtomsAdaptor
from chgnet.graph.converter import CrystalGraphConverter

# === Setup ===
ase_adaptor = AseAtomsAdaptor()
converter = CrystalGraphConverter(atom_graph_cutoff=6.0, bond_graph_cutoff=3.0,algorithm="fast",
    on_isolated_atoms="warn",
    verbose=True)

# === Function for one graph ===
def convert_atom(atom):
    structure = ase_adaptor.get_structure(atom)
    return converter(structure)


CrystalGraphConverter(algorithm='fast', atom_graph_cutoff=6.0, bond_graph_cutoff=3.0)


In [8]:

# === Parallel Execution ===
train_graphs = Parallel(n_jobs=32)(delayed(convert_atom)(atom) for atom in train_atoms)
valid_graphs = Parallel(n_jobs=32)(delayed(convert_atom)(atom) for atom in valid_atoms)


In [9]:
import torch

torch.save(train_graphs, "t1_chgnet_graphs.pt")
torch.save(valid_graphs, "t2_chgnet_graphs.pt")

print("✅ Saved graphs to disk.")


✅ Saved graphs to disk.


In [10]:
from torch.utils.data import Dataset, DataLoader

class GraphDataset(Dataset):
    def __init__(self, graphs):
        self.graphs = graphs

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return self.graphs[idx]

# === Wrap datasets ===
train_dataset = GraphDataset(train_graphs)
valid_dataset = GraphDataset(valid_graphs)




In [44]:
# === Create loaders ===
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

In [15]:
import torch
from chgnet.model.model import CHGNet

# === Force load on CPU first ===
device = torch.device("cuda:0")

# Load the model safely on CPU first
model = CHGNet.load(use_device="cpu", verbose=True)

# Then move manually to the MIG GPU
model = model.to(device)

# === Set to train mode ===
model.train()

# === Optional: Check model arch ===
print("✅ CHGNet loaded on", device)


CHGNet v0.3.0 initialized with 412,525 parameters
CHGNet will run on cpu
✅ CHGNet loaded on cuda:0


In [40]:
from chgnet.data.dataset import StructureData
import numpy as np
def atoms_to_structuredata(atoms_list):
    structures = []
    energies = []
    forces = []

    for atoms in atoms_list:
        structures.append(ase_adaptor.get_structure(atoms))

        energy = atoms.info.get("REF_energy", None)
        force = atoms.arrays.get("REF_forces", None)

        if energy is None or force is None:
            raise ValueError("Missing REF_energy or REF_forces")

        energies.append(energy)
        forces.append(np.array(force, dtype=np.float32))


    return StructureData(structures=structures, energies=energies, forces=forces)

train_data = atoms_to_structuredata(train_atoms)
valid_data = atoms_to_structuredata(valid_atoms)

print("✅ Created StructureData for training and validation")


StructureData imported 6,337 structures
StructureData imported 705 structures
✅ Created StructureData for training and validation


In [41]:
import torch
from ase.io import write
import joblib

# === Atoms ===
write("t1_atoms.extxyz", train_atoms)
write("t2_atoms.extxyz", valid_atoms)

# === StructureData ===
joblib.dump(train_data, "t1_structure_data.pkl")
joblib.dump(valid_data, "t2_structure_data.pkl")

print("✅ Saved graphs, atoms, and structured datasets")


✅ Saved graphs, atoms, and structured datasets


In [1]:
from chgnet.data.dataset import StructureData

# Merge two StructureData objects
full_data = StructureData(
    structures=train_data.structures + valid_data.structures,
    energies=train_data.energies + valid_data.energies,
    forces=train_data.forces + valid_data.forces,
)

train_loader, val_loader, _ = get_train_val_test_loader(
    dataset=full_data,
    batch_size=4,
    train_ratio=len(train_data) / (len(train_data) + len(valid_data)),
    val_ratio=len(valid_data) / (len(train_data) + len(valid_data)),
    num_workers=4,
    pin_memory=True
)
trainer.train(train_loader, val_loader)


NameError: name 'train_data' is not defined

In [75]:
import os
from chgnet.trainer import Trainer
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
trainer = Trainer(
    model=model,
    targets="ef",           # energy + forces
    optimizer="Adam",
    criterion="MSE",
    learning_rate=1e-4,
    epochs=10,              # you can reduce to 10 for testing
    use_device=device
)


Begin Training: using cuda:0 device
training targets: ef
Epoch: [0][1/1585] | Time (0.517)(0.334) | Loss 7803296.0000(7803296.0000) | MAE e 2787.727(2787.727)  f 0.903(0.903)  
Epoch: [0][100/1585] | Time (1.411)(1.188) | Loss 8018614.5000(7901433.4750) | MAE e 2831.121(2802.089)  f 1.391(1.161)  
Epoch: [0][200/1585] | Time (1.416)(1.193) | Loss 7980509.0000(7858899.1875) | MAE e 2819.000(2793.824)  f 1.819(1.481)  
Epoch: [0][300/1585] | Time (1.422)(1.197) | Loss 7629134.0000(7856701.1150) | MAE e 2759.278(2793.441)  f 2.367(1.764)  
Epoch: [0][400/1585] | Time (1.431)(1.205) | Loss 6995093.5000(7828456.3388) | MAE e 2633.226(2788.328)  f 3.773(2.018)  
Epoch: [0][500/1585] | Time (1.436)(1.210) | Loss 7683388.0000(7836931.0820) | MAE e 2764.686(2790.037)  f 2.207(2.228)  
Epoch: [0][600/1585] | Time (1.476)(1.252) | Loss 7284473.0000(7812712.5967) | MAE e 2694.810(2785.631)  f 3.673(2.416)  
Epoch: [0][700/1585] | Time (1.469)(1.245) | Loss 6999699.0000(7796330.0343) | MAE e 2641.7

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'chgnet.graph.crystalgraph.CrystalGraph'>

In [37]:
print(len(train_data.forces), len(train_data.structures))


<class 'numpy.ndarray'> (900, 3)
[[ 0.0230214   0.0311501  -0.34367013]
 [-0.04187635 -0.10079677 -0.40870938]
 [ 0.31012675  0.05631611 -0.21161802]
 [ 0.3888652  -0.5531863  -0.31374961]
 [ 1.6905961  -0.74202549  0.22162393]]


In [69]:
import torch
torch.cuda.empty_cache()

In [38]:
print(train_data[0])


(CrystalGraph(composition='Li840 La36 Zr24 O144', atom_graph_cutoff=6, bond_graph_cutoff=3, n_atoms=1044, atom_graph_len=109644, bond_graph_len=189152), {'e': tensor(-3105.4402), 'f': tensor([[ 0.2227,  0.8111, -0.1087],
        [-0.1613, -0.0363,  0.2550],
        [-0.1942,  0.1892, -0.2813],
        ...,
        [ 0.4923,  0.0141, -0.3031],
        [-0.6534, -0.2593,  0.5196],
        [-0.2980, -0.4079, -0.0297]])})


In [39]:
print(trainer.targets)  # should be "ef"


ef


In [None]:
trainer.save_model("T1_finetuned_chgnet.pt")
