In [1]:
#!/usr/bin/env python3
import ijson, random, sys
from ase import Atoms
from ase.io import write

# --- config ---
# Source JSON streamed by main(); its top level maps material id -> dict of entries.
INPATH   = "/home/phanim/harshitrawat/MPtrj_2022.9_full.json"
# Destination file handed to ase.io.write once sampling has finished.
OUTPATH  = "/home/phanim/harshitrawat/summer/replay_LiLaZrO_5k_le200.extxyz"
# valid_entry() accepts an entry only if every symbol it contains is in this set.
WANTED   = {"Li", "La", "Zr", "O"}
# valid_entry() rejects entries whose symbol list is longer than this.
MAX_AT   = 200
# Reservoir size: main() keeps and writes at most this many structures.
TARGET_N = 5000
# Seed for the stdlib `random` module, which drives the reservoir replacement draws.
SEED     = 42

random.seed(SEED)

def valid_entry(entry):
    """Return True when `entry` passes every filter used for sampling.

    An entry is accepted only if all of the following hold:
      * its "symbols" contain nothing outside WANTED,
      * it has between 1 and 4 distinct symbols,
      * it has at most MAX_AT symbols in total,
      * it carries "positions", "cell" and "energy" keys.

    Any exception raised while checking (missing "symbols", an entry that is
    not a mapping, unhashable symbols, ...) is treated as "not valid".
    """
    try:
        symbols = entry["symbols"]
        species = set(symbols)
        rejected = (
            not species.issubset(WANTED)
            or len(species) < 1
            or len(species) > 4
            or len(symbols) > MAX_AT
        )
        if rejected:
            return False
        # All three payload keys must be present for make_atoms() to work.
        return all(key in entry for key in ("positions", "cell", "energy"))
    except Exception:
        return False

def make_atoms(entry):
    """Build a fully periodic ASE `Atoms` from one entry and attach its energy.

    Reads the "symbols", "positions", "cell" and "energy" keys of `entry`.
    The energy is converted to a plain float and stored under
    `atoms.info["energy"]`.
    """
    geometry = {key: entry[key] for key in ("symbols", "positions", "cell")}
    structure = Atoms(pbc=True, **geometry)
    structure.info["energy"] = float(entry["energy"])
    return structure

def main():
    """Reservoir-sample up to TARGET_N valid structures from INPATH and write them to OUTPATH.

    The input is streamed with ijson one top-level (material id, entries) pair
    at a time. Every entry that passes valid_entry() is first converted with
    make_atoms(); only entries that convert successfully are counted as
    candidates. Building before counting keeps the sampling uniform: an entry
    that cannot be built never inflates `seen_valid`, whether or not it would
    have been drawn for replacement. The trade-off is that an `Atoms` object
    is constructed for every candidate, not only for the ones that are kept.

    Prints the number of structures written, the candidate/kept counters and
    the number of entries skipped because make_atoms() raised.
    """
    reservoir = []
    kept = 0            # entries appended during the initial fill phase
    seen_valid = 0      # entries that passed the filter AND were built
    build_failures = 0  # entries that passed the filter but could not be built

    with open(INPATH, "rb") as f:
        # stream top-level dict: material_id → dict of entries
        for _mid, entries in ijson.kvitems(f, "", multiple_values=True):
            for entry in entries.values():
                if not valid_entry(entry):
                    continue

                try:
                    atoms = make_atoms(entry)
                except Exception:
                    # Best effort: skip malformed entries, but keep a tally so
                    # the skip is visible in the final stats.
                    build_failures += 1
                    continue

                seen_valid += 1
                if kept < TARGET_N:
                    reservoir.append(atoms)
                    kept += 1
                else:
                    # reservoir sampling replacement: candidate number
                    # `seen_valid` is kept with probability TARGET_N / seen_valid
                    j = random.randrange(seen_valid)
                    if j < TARGET_N:
                        reservoir[j] = atoms

    write(OUTPATH, reservoir)
    print(f"[done] wrote {len(reservoir)} structures to {OUTPATH}")
    print(f"[stats] seen_valid={seen_valid}, kept={kept}")
    print(f"[stats] build_failures={build_failures}")

if __name__ == "__main__":
    # Entry point when run as a script or notebook cell.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: stop with exit status 130 (the usual 128 + SIGINT shell
        # convention) instead of dumping a traceback.
        raise SystemExit(130)


IncompleteJSONError: lexical error: invalid char in json text.
                                                            (right here) ------^


In [2]:
import ijson

# Peek at the first four top-level (key, value) pairs of the dataset without
# loading the whole file: ijson yields them one at a time from the stream.
f = "/home/phanim/harshitrawat/MPtrj_2022.9_full.json"
with open(f, "rb") as fd:
    for i, (k, v) in enumerate(ijson.kvitems(fd, "", multiple_values=True)):
        # Show the item index, its key, and at most five of its sub-keys.
        print(i, k, list(v)[:5])
        if i >= 3:
            break


0 mp-1005792 ['mp-1012897-0-0', 'mp-1005792-0-1', 'mp-1005792-0-0', 'mp-1005792-1-1', 'mp-1005792-1-0']
1 mp-1006278 ['mp-1006287-0-0', 'mp-1006278-0-4', 'mp-1006278-0-3', 'mp-1006278-0-2', 'mp-1006278-0-1']
2 mp-10068 ['mp-910115-0-0', 'mp-10068-0-2', 'mp-10068-1-4', 'mp-10068-1-2', 'mp-10068-1-0']
3 mp-1007758 ['mp-1007758-0-0', 'mp-1007758-1-10', 'mp-1007758-1-9', 'mp-1007758-1-8', 'mp-1007758-1-6']


In [3]:
import sys, json, gzip
p="MPtrj_2022.9_full.json"  # change to your path
# Hard cap on how much of the file this cell ever reads. An unbounded
# readline() / json.load() on a file that stores everything on one line pulls
# the entire file into memory, so only a fixed-size prefix is inspected.
PEEK_BYTES = 4096
opener = gzip.open if p.endswith(".gz") else open
with opener(p, "rb") as f:
    b = f.read(PEEK_BYTES)

# Guess the layout from the first non-whitespace byte of the prefix.
lead = b.lstrip()[:1]
if not b:
    layout = "empty file"
elif lead == b"[":
    layout = "JSON array"
elif lead == b"{":
    layout = "JSON object, or NDJSON with one object per line"
else:
    layout = "unrecognised"

# Show the first line if it ends within the prefix, otherwise the whole prefix.
first_line, newline, _rest = b.lstrip().partition(b"\n")
truncated = not newline and len(b) == PEEK_BYTES
# errors="replace": the cut-off may land in the middle of a multi-byte character.
preview = first_line.decode("utf-8", errors="replace").strip()
print(f"[peek] {p}: looks like {layout}")
print(preview + (" ..." if truncated else ""))


^C
