In [1]:
from dotenv import load_dotenv
import os

load_dotenv("/Users/yutongduan/llamp/.env.local", override=True)

print("MP key:", "SET" if (os.getenv("PMG_MAPI_KEY") or os.getenv("MP_API_KEY")) else "MISSING")
print("Google key:", "SET" if os.getenv("GOOGLE_API_KEY") else "MISSING")

MP key: SET
Google key: SET


In [2]:
from mp_api.client import MPRester
import os

key = os.getenv("PMG_MAPI_KEY") or os.getenv("MP_API_KEY")
with MPRester(key) as mpr:
    doc = mpr.materials.summary.search(material_ids=["mp-149"], fields=["band_gap"])[0]
    print("mp-149 band_gap =", doc.band_gap)

  from .autonotebook import tqdm as notebook_tqdm
Retrieving SummaryDoc documents: 100%|██████████| 1/1 [00:00<00:00, 12483.05it/s]

mp-149 band_gap = 0.6105





In [4]:
import random
import os
import itertools
from mp_api.client import MPRester

key = os.getenv("PMG_MAPI_KEY") or os.getenv("MP_API_KEY")

with MPRester(key) as mpr:
    docs_iter = mpr.materials.summary.search(
        is_stable=True,
        fields=["material_id", "band_gap"],
    )
    # 只取前 2000 条，避免拉太多
    docs = list(itertools.islice(docs_iter, 2000))

ids = [d.material_id for d in docs if (d.band_gap is not None and d.band_gap >= 0.1)]

print("downloaded docs:", len(docs))
print("candidates after band_gap filter:", len(ids))

random.seed(42)
sample_ids = random.sample(ids, 30)

sample_ids[:5], len(sample_ids)

Retrieving SummaryDoc documents: 100%|██████████| 33973/33973 [00:08<00:00, 3952.47it/s]

downloaded docs: 2000
candidates after band_gap filter: 1258





([MPID(mp-28020),
  MPID(mp-1229128),
  MPID(mp-1029616),
  MPID(mp-1203559),
  MPID(mp-23184)],
 30)

In [5]:
import pandas as pd

# 把 MPID(mp-xxxx) 转成 "mp-xxxx"
sample_ids_str = [str(x) for x in sample_ids]

df = pd.DataFrame({"material_id": sample_ids_str})
df.to_csv("materials.csv", index=False)

print("Saved materials.csv:", df.shape)
df.head()

Saved materials.csv: (30, 1)


Unnamed: 0,material_id
0,mp-28020
1,mp-1229128
2,mp-1029616
3,mp-1203559
4,mp-23184


In [6]:
import pandas as pd
import os
from mp_api.client import MPRester

props = ["band_gap", "formation_energy_per_atom", "energy_above_hull"]

df_ids = pd.read_csv("materials.csv")
material_ids = df_ids["material_id"].tolist()

key = os.getenv("PMG_MAPI_KEY") or os.getenv("MP_API_KEY")

rows = []
with MPRester(key) as mpr:
    docs = mpr.materials.summary.search(
        material_ids=material_ids,
        fields=["material_id"] + props,
    )
    for d in docs:
        rows.append({
            "material_id": str(d.material_id),
            "band_gap": d.band_gap,
            "formation_energy_per_atom": d.formation_energy_per_atom,
            "energy_above_hull": d.energy_above_hull,
        })

gt = pd.DataFrame(rows).drop_duplicates(subset=["material_id"]).sort_values("material_id")
gt.to_csv("ground_truth.csv", index=False)

print("Saved ground_truth.csv:", gt.shape)
gt.head()

Retrieving SummaryDoc documents: 100%|██████████| 30/30 [00:00<00:00, 113975.65it/s]

Saved ground_truth.csv: (30, 4)





Unnamed: 0,material_id,band_gap,formation_energy_per_atom,energy_above_hull
16,mp-1029616,2.658,-1.280364,0.0
4,mp-1029918,1.5624,-1.113593,0.0
21,mp-10336,0.5314,-1.451447,0.0
8,mp-1201606,3.0005,-2.142932,0.0
17,mp-1203559,5.8133,-0.450126,0.0


In [7]:
import pandas as pd
gt = pd.read_csv("ground_truth.csv")

print("missing counts:")
print(gt.isna().sum())

print("\nvalue ranges:")
print("band_gap min/max:", gt["band_gap"].min(), gt["band_gap"].max())
print("formation_energy_per_atom min/max:", gt["formation_energy_per_atom"].min(), gt["formation_energy_per_atom"].max())
print("energy_above_hull min/max:", gt["energy_above_hull"].min(), gt["energy_above_hull"].max())

missing counts:
material_id                  0
band_gap                     0
formation_energy_per_atom    0
energy_above_hull            0
dtype: int64

value ranges:
band_gap min/max: 0.198499999999999 6.9502
formation_energy_per_atom min/max: -4.159415700833336 -0.03840222770408
energy_above_hull min/max: 0.0 0.0


In [None]:
EVAL_PROPS = ["band_gap", "formation_energy_per_atom"]  # 先不评估 energy_above_hull