# create custom dataset

In [19]:
import os
import pickle
import numpy as np
from pymatgen.core import Structure


CIF_DIR = "data/"  
OUTPUT_FILE = "data/my_custom_data.p"


dataset = {}

# 遍历CIF目录
for cif_file in os.listdir(CIF_DIR):
    if not cif_file.endswith(".cif"):
        continue
        
    try:
        # 从CIF文件加载结构
        struct = Structure.from_file(os.path.join(CIF_DIR, cif_file))
        
        # 添加到数据集（使用文件名作为材料ID）
        material_id = os.path.splitext(cif_file)[0]
        dataset[material_id] = {
            "structure": [struct],  # 单个结构
            "energy": [0.0],        # 替换为实际能量值
            "force": [np.zeros((len(struct), 3))]  # 替换为实际力矩阵
        }
        
    except Exception as e:
        print(f"处理文件 {cif_file} 时出错: {str(e)}")

# 保存数据集
if dataset:
    with open(OUTPUT_FILE, "wb") as f:
        pickle.dump(dataset, f)
    print(f"成功保存数据集到 {OUTPUT_FILE}, 包含 {len(dataset)} 个材料")
else:
    print("未找到有效的CIF文件")

成功保存数据集到 data/my_custom_data.p, 包含 119 个材料


# 训练

In [31]:
import pickle as pk
import pandas as pd
import numpy as np
import tensorflow as tf
from m3gnet.models import M3GNet, Potential
from m3gnet.trainers import PotentialTrainer
import pymatgen

print('加载数据集')#block_0，
with open('data/my_custom_data.p', 'rb') as f:
    data = pk.load(f)

# with open('data/block_1.p', 'rb') as f:
#     data2 = pk.load(f)

# data.update(data2)
print('数据集加载完成')


def get_id_train_val_test(
    total_size: int,
    split_seed: int = 42,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    test_ratio: float = 0.1,
    keep_data_order: bool = False
):
    """
    分割数据集索引为训练/验证/测试集
    """
    assert train_ratio + val_ratio + test_ratio == 1
    
    indices = np.arange(total_size)
    if not keep_data_order:
        rng = np.random.RandomState(split_seed)
        rng.shuffle(indices)
    
    train_end = int(train_ratio * total_size)
    val_end = train_end + int(val_ratio * total_size)
    
    return (
        indices[:train_end].tolist(),
        indices[train_end:val_end].tolist(),
        indices[val_end:].tolist()
    )

# 计算总结构数
total_structures = sum(len(item['energy']) for item in data.values())
id_train, id_val, id_test = get_id_train_val_test(
    total_size=total_structures,
    split_seed=42,
    train_ratio=0.90,
    val_ratio=0.05,
    test_ratio=0.05,
    keep_data_order=False,
)

# 初始化数据集列表
dataset_train = []
dataset_val = []
dataset_test = []

structure_index = 0
for key, item in data.items():
    # 处理每个结构的数据
    for iid in range(len(item['energy'])):
        if structure_index in id_train:
            target_list = dataset_train
        elif structure_index in id_val:
            target_list = dataset_val
        elif structure_index in id_test:
            target_list = dataset_test
        
        target_list.append({
            "atoms": item['structure'][iid],
            "energy": item['energy'][iid] / len(item['force'][iid]),
            "force": np.array(item['force'][iid])
        })
        
        structure_index += 1

print(f'使用 {len(dataset_train)} 个样本训练, {len(dataset_val)} 个样本验证, {len(dataset_test)} 个样本测试')

# 准备训练数据
def extract_data(dataset):
    structures = [d["atoms"] for d in dataset]
    energies = [d["energy"] for d in dataset]
    forces = [d["force"] for d in dataset]
    return structures, energies, forces

train_structures, train_energies, train_forces = extract_data(dataset_train)
val_structures, val_energies, val_forces = extract_data(dataset_val)

# 初始化模型和训练器
m3gnet = M3GNet(is_intensive=False)
potential = Potential(model=m3gnet)

trainer = PotentialTrainer(
    potential=potential,
    optimizer=tf.keras.optimizers.Adam(1e-3)
)

# 开始训练
trainer.train(
    train_structures,
    train_energies,
    train_forces,
    validation_graphs_or_structures=val_structures,
    val_energies=val_energies,
    val_forces=val_forces,
    epochs=10,
    fit_per_element_offset=True,
    save_checkpoint=True
)

print('模型训练完成，保存模型')



加载数据集
数据集加载完成
使用 107 个样本训练, 5 个样本验证, 7 个样本测试
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
模型训练完成，保存模型


# eval model

In [32]:
import pickle as pk
import numpy as np
import matplotlib.pyplot as plt
from m3gnet.models import load_model
from m3gnet.trainers import Evaluator

# 1. 加载模型和测试数据
model = load_model("callbacks/")
with open('data/test_set.p', 'rb') as f:
    test_data = pk.load(f)

# 2. 初始化评估器
evaluator = Evaluator(model)

# 3. 计算性能指标
results = evaluator.evaluate(
    test_structures=[d["atoms"] for d in test_data],
    test_energies=[d["energy"] for d in test_data],
    test_forces=[d["force"] for d in test_data]
)

# 4. 输出报告
print("=== 模型评估报告 ===")
print(f"能量平均绝对误差 (MAE): {results['energy_mae']:.4f} eV")
print(f"力分量平均绝对误差 (MAE): {results['force_mae']:.4f} eV/Å")
print(f"能量均方根误差 (RMSE): {results['energy_rmse']:.4f} eV")
print(f"力分量均方根误差 (RMSE): {results['force_rmse']:.4f} eV/Å")

# 5. 可视化误差分布
plt.figure(figsize=(10, 6))
plt.hist(results['energy_errors'], bins=50, alpha=0.7, color='blue')
plt.xlabel('能量预测误差 (eV)')
plt.ylabel('样本数量')
plt.title('能量预测误差分布')
plt.savefig('energy_error_distribution.png')
print("误差分布图已保存至 energy_error_distribution.png")


ImportError: cannot import name 'load_model' from 'm3gnet.models' (c:\Users\yyxc-one\Desktop\m3gnet-custom\m3gnet\models\__init__.py)