In [1]:
import pandas as pd

# Workbooks that hold the original CHGNet predictions
orig_excels = [
    "/home/phanim/harshitrawat/summer/md/mdinfo_chgnet_predictions_forces.xlsx",
    "/home/phanim/harshitrawat/summer/md/strain_perturb_chgnet_predictions_forces.xlsx",
]

# Read every workbook, then stack the frames under one fresh 0..n-1 index
prediction_frames = []
for excel_path in orig_excels:
    prediction_frames.append(pd.read_excel(excel_path))
df_orig = pd.concat(prediction_frames, ignore_index=True)

# The T1 split is only consulted for the file names it lists
df_t1 = pd.read_excel("/home/phanim/harshitrawat/summer/md/T1_chgnet_labeled.xlsx")
t1_files = {name for name in df_t1["file"].tolist()}

# Keep the prediction rows whose "file" value is part of T1
in_t1 = df_orig["file"].isin(t1_files)
df_pred = df_orig.loc[in_t1].reset_index(drop=True)


In [4]:
import json
import os

import numpy as np
import pandas as pd
from ase.calculators.calculator import CalculatorError
from ase.calculators.singlepoint import SinglePointCalculator
from ase.db import connect
from ase.io import read
from chgnet.model.dynamics import CHGNetCalculator

# === Input / output locations ===
base_dir = "/home/phanim/harshitrawat/summer/md"
t1_split = "/home/phanim/harshitrawat/summer/md/T1_chgnet_labeled.xlsx"
orig_excels = [
    "/home/phanim/harshitrawat/summer/md/mdinfo_chgnet_predictions_forces.xlsx",
    "/home/phanim/harshitrawat/summer/md/strain_perturb_chgnet_predictions_forces.xlsx",
]
db_path = "mace_train_T1.db"

# === Original predictions, restricted to files listed in the T1 split ===
prediction_frames = list(map(pd.read_excel, orig_excels))
df_orig = pd.concat(prediction_frames, ignore_index=True)
t1_files = set(pd.read_excel(t1_split)["file"])
keep_mask = df_orig["file"].isin(t1_files)
df_pred = df_orig[keep_mask].reset_index(drop=True)

# === CHGNet calculator pinned to the CPU ===
calc = CHGNetCalculator(device="cpu")
# Explicitly move the underlying model to the CPU as well
calc.model.to("cpu")

# === Helper ===
def resolve_cif(fname, root=None):
    """Return the expected path of the CIF file called ``fname``.

    File names containing ``"perturbed"`` are mapped to the
    ``mdcifs_strained_perturbed`` sub-directory, all others to ``mdcifs``.

    Parameters
    ----------
    fname : str
        CIF file name without any directory part.
    root : str or None, optional
        Directory that contains the two CIF sub-directories. When omitted,
        the notebook-level ``base_dir`` is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    str
        ``root/<sub-directory>/fname``. The path is only built here; it is
        not checked for existence.
    """
    if root is None:
        # Looked up at call time so a later change to base_dir is honoured.
        root = base_dir
    sub = "mdcifs_strained_perturbed" if "perturbed" in fname else "mdcifs"
    return os.path.join(root, sub, fname)

# === Build ASE DB ===
def _as_voigt(stress):
    """Return ``stress`` as a 6-component vector ordered xx, yy, zz, yz, xz, xy.

    Accepts a 3x3 tensor (off-diagonal pairs are averaged) or a vector that
    already has 6 components. Any other shape raises ``ValueError``.
    """
    s = np.asarray(stress, dtype=float)
    if s.shape == (3, 3):
        return np.array([
            s[0, 0], s[1, 1], s[2, 2],
            (s[1, 2] + s[2, 1]) / 2.0,
            (s[0, 2] + s[2, 0]) / 2.0,
            (s[0, 1] + s[1, 0]) / 2.0,
        ])
    if s.shape == (6,):
        return s
    raise ValueError(f"unexpected stress shape {s.shape}")


def _labels_from_row(row, n_atoms):
    """Parse ``(energy, forces, stress)`` from one prediction row.

    Returns ``None`` when a field is missing, is not valid JSON, holds
    non-finite numbers, or when the forces are not shaped ``(n_atoms, 3)``;
    the caller then falls back to recomputing the labels.
    """
    try:
        energy = float(row["energy_eV"])
        forces = np.asarray(json.loads(row["forces_per_atom_eV_per_A"]), dtype=float)
        stress = _as_voigt(json.loads(row["stress_tensor"]))
    except (KeyError, TypeError, ValueError):
        # TypeError: empty (NaN) cell; ValueError: broken JSON or ragged/odd shape.
        return None
    if forces.shape != (n_atoms, 3):
        return None
    # float("nan") does not raise, so an empty energy cell must be caught here.
    if not (np.isfinite(energy) and np.isfinite(forces).all() and np.isfinite(stress).all()):
        return None
    return energy, forces, stress


n_ok = n_fail = n_recomputed = 0
# NOTE(review): every write happens inside this single `with` block and
# append=False starts from an empty database; presumably an interrupted run
# leaves nothing usable behind -- confirm against the ASE db documentation.
with connect(db_path, append=False) as db:
    for _, row in df_pred.iterrows():
        fname = row["file"]
        path = resolve_cif(fname)
        if not os.path.exists(path):
            print(f"❌ Missing CIF: {fname}")
            n_fail += 1
            continue

        # Load the structure; one unreadable file must not abort the whole run.
        try:
            atoms = read(path)
        except Exception as e:
            print(f"❌ Unreadable CIF: {fname}: {e}")
            n_fail += 1
            continue

        # Prefer the labels stored in the Excel row.
        labels = _labels_from_row(row, len(atoms))

        # Fall back to CHGNet when the stored labels are unusable.
        if labels is None:
            try:
                atoms.calc = calc
                labels = (
                    float(atoms.get_potential_energy()),
                    np.array(atoms.get_forces(), dtype=float),
                    np.array(atoms.get_stress(), dtype=float),  # Voigt form by default
                )
            except CalculatorError as e:
                print(f"⚠️ CHGNet failed on {fname}: {e}")
                n_fail += 1
                continue
            # Counted only after success so a failure is not tallied twice.
            n_recomputed += 1
            print(f"🛠 Recomputed CHGNet for {fname}")

        E, F, S = labels
        # Labels are attached as calculator results: ase.db stores calculator
        # properties with the row, whereas entries placed in atoms.info or
        # atoms.arrays are not written to the database.
        # NOTE(review): stress from the Excel is stored as given; confirm it is
        # in eV/Å^3 (the unit ASE uses) rather than GPa.
        atoms.calc = SinglePointCalculator(atoms, energy=E, forces=F, stress=S)
        db.write(atoms)
        n_ok += 1
        print(f"✅ {fname}")

print(f"\n🎉 Done: {n_ok} OK, {n_recomputed} recomputed, {n_fail} failed → {db_path}")


KeyboardInterrupt: 

With this notebook, the JSON is never re-exported through the “T1” Excel file; instead, the pristine JSON strings from the original CHGNet output files are used directly.

This removes the JSON “delimiter” parse errors, because the JSON strings in the original CHGNet output files were written correctly by pandas.

The rest of the pipeline (path resolution, ASE→DB, MACE training) proceeds as before.