In [8]:
!nvidia-smi

Tue Jul 22 18:35:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H200                    On  |   00000000:1B:00.0 Off |                   On |
| N/A   28C    P0            112W /  700W |     221MiB / 143771MiB |     N/A      Default |
|                                         |                        |              Enabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H200                    On  |   00

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu121
True


In [1]:
import torch
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))  # Try this only if device_count > 0


1
NVIDIA H200 MIG 1g.35gb


In [3]:
import os
import torch
import json
import pandas as pd
import numpy as np
from ase.io import read
from ase.db import connect
from chgnet.model.dynamics import CHGNetCalculator
from ase.calculators.calculator import CalculatorError
print("📥 Starting... Loading Excel and preparing to write DB/XYZ")

# === Paths ===
orig_excels = [
    "/home/phanim/harshitrawat/summer/md/mdinfo_chgnet_predictions_forces.xlsx",
    "/home/phanim/harshitrawat/summer/md/strain_perturb_chgnet_predictions_forces.xlsx"
]
t2_split = "/home/phanim/harshitrawat/summer/md/T2_chgnet_labeled.xlsx"
base_dir   = "/home/phanim/harshitrawat/summer/md"
db_path    = "mace_train_T2.db"

# === Load & filter ===
df_orig = pd.concat([pd.read_excel(x) for x in orig_excels], ignore_index=True)
t2_files = set(pd.read_excel(t2_split)["file"])
df = df_orig[df_orig["file"].isin(t2_files)].reset_index(drop=True)

print(f"📂 Total T2 entries: {len(df)}")

# === Prepare lists ===
good = []   # tuples of (fname, E, F, S)
bad  = []   # list of fnames needing CHGNet

for row in df.itertuples(index=False):
    fname = row.file
    try:
        E = float(row.energy_eV)
        F = json.loads(row.forces_per_atom_eV_per_A)
        S = json.loads(row.stress_tensor)
        # quick length check
        # we’ll verify atom count later
        good.append((fname, E, F, S))
    except Exception:
        bad.append(fname)

print(f"✅ Good JSON: {len(good)}, ⚠️ Bad JSON entries: {len(bad)}")

# === Helper ===
def resolve_cif(fname):
    sub = "mdcifs_strained_perturbed" if "perturbed" in fname else "mdcifs"
    return os.path.join(base_dir, sub, fname)

# === Open DB ===
n_ok = n_fail = n_recomputed = 0
with connect(db_path, append=False) as db:

    # 1) Write all good entries
    print("\n▶️ Writing all valid entries to DB…")
    for i, (fname, E, F, S) in enumerate(good, 1):
        path = resolve_cif(fname)
        if not os.path.exists(path):
            print(f"❌ [{i}/{len(good)}] Missing CIF: {fname}")
            n_fail += 1
            continue
        try:
            atoms = read(path)
            if len(F) != len(atoms):
                raise ValueError("Atom/force count mismatch")
            atoms.info["energy"]  = E
            atoms.info["stress"]  = S
            atoms.arrays["forces"] = F
            db.write(atoms)
            n_ok += 1
            print(f"✅ [{i}/{len(good)}] {fname}")
        except Exception as e:
            print(f"❌ [{i}/{len(good)}] {fname} — {e}")
            n_fail += 1

    # 2) Recompute bad entries on GPU
    if bad:
        print(f"\n▶️ Recomputing {len(bad)} bad entries on {device}…")
        # Explicit MIG workaround
        device = torch.device("cuda:0")
        
        # Load model manually on CPU
        model = CHGNet.load(use_device="cpu", verbose=True)
        
        # Force move to device
        model = model.to(device)
        
        # Build calculator manually
        calc = CHGNetCalculator(model=model, use_device=device)

        for j, fname in enumerate(bad, 1):
            path = resolve_cif(fname)
            if not os.path.exists(path):
                print(f"❌ [{j}/{len(bad)}] Missing CIF: {fname}")
                n_fail += 1
                continue
            try:
                atoms = read(path)
                atoms.calc = calc
                E = atoms.get_potential_energy()
                F = atoms.get_forces().tolist()
                S = atoms.get_stress(voigt=False).tolist()
                if len(F) != len(atoms):
                    raise ValueError("Atom/force count mismatch")
                atoms.info["energy"]  = float(E)
                atoms.info["stress"]  = S
                atoms.arrays["forces"] = F
                db.write(atoms)
                n_recomputed += 1
                print(f"🛠 [{j}/{len(bad)}] Recomputed {fname}")
            except CalculatorError as e:
                print(f"⚠️ [{j}/{len(bad)}] CHGNet failed: {fname} — {e}")
                n_fail += 1
            except Exception as e:
                print(f"❌ [{j}/{len(bad)}] {fname} — {e}")
                n_fail += 1

print(f"\n🎉 Done — OK: {n_ok}, Recomputed: {n_recomputed}, Failed: {n_fail}")
print(f"📦 DB written to: {db_path}")


📥 Starting... Loading Excel and preparing to write DB/XYZ
📂 Total T2 entries: 3169
✅ Good JSON: 0, ⚠️ Bad JSON entries: 3169

▶️ Writing all valid entries to DB…

▶️ Recomputing 3169 bad entries on cuda…


NameError: name 'torch' is not defined

In [3]:
from chgnet.model.model import CHGNet

device = torch.device("cuda:0")

# Manually load model on CPU
model = CHGNet.load(use_device="cpu", verbose=True)

# Then manually move to your device (forces allocation)
model = model.to(device)

# Inject into calculator
calc = CHGNetCalculator(model=model, use_device=device)


CHGNet v0.3.0 initialized with 412,525 parameters
CHGNet will run on cpu
CHGNet will run on cuda:0


In [5]:
import torch
print(torch.version.cuda)  # Should match CHGNet’s requirements (e.g., 12.1)
print(torch.cuda.is_available())  # True
print(torch.cuda.device_count())  # 1
print(torch.cuda.get_device_name(0))  # Should be your MIG slice
print(torch.cuda.current_device())  # Should be 0


12.1
True
1
NVIDIA H200 MIG 1g.35gb
0


In [6]:
!export CUDA_VISIBLE_DEVICES=0
