In [1]:
# ===== Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import joblib

# ---- 全局随机种子 ----
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用设备:", device)


def compute_regression_metrics(y_true, y_pred):
    """计算 MAE / RMSE / R2 / Pearson_r"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2   = float(r2_score(y_true, y_pred))

    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        pr = float("nan")
    else:
        pr, _ = pearsonr(y_true, y_pred)
        pr = float(pr)

    return {"MAE": mae, "RMSE": rmse, "R2": r2, "Pearson_r": pr}


def np_encoder(o):
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


使用设备: cuda


In [2]:
# ===== Cell 2: 路径 & 读取 df =====
ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Graph 嵌入 + row_id
GRAPH_DIR        = ROOT_MULTI / "graph" / "graph_outputs"
GRAPH_EMB_PATH   = GRAPH_DIR / "reg_graph_embeddings.npy"
GRAPH_ROWID_PATH = GRAPH_DIR / "row_id_graph_for_emb.npy"

# PhysChem 嵌入 + row_id
PHY_DIR        = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"
PHY_EMB_PATH   = PHY_DIR / "emb_physchem_mlp_all.npy"
PHY_ROWID_PATH = PHY_DIR / "row_id_clean.npy"

# 原始数据
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

# 输出目录
MID_ROOT   = ROOT_MULTI / "mid(G+P)"
OUT_GP     = MID_ROOT / "graph_physchem_meta"
MODELS_CA  = OUT_GP / "models_crossattn"
MODELS_RF  = OUT_GP / "models_rf"
for d in [MID_ROOT, OUT_GP, MODELS_CA, MODELS_RF]:
    d.mkdir(parents=True, exist_ok=True)

SMILES_COL   = "SMILES_Canonical_RDKit"
DURATION_COL = "Duration_Value(hour)"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
LABEL_RAW    = "mgperL"
LABEL_LOG    = "mgperL_log"
LABEL_COL    = LABEL_LOG

print("GRAPH_EMB_PATH:", GRAPH_EMB_PATH)
print("PHY_EMB_PATH  :", PHY_EMB_PATH)
print("DATA_PATH     :", DATA_PATH)
print("OUT_GP        :", OUT_GP)

df = pd.read_excel(DATA_PATH, engine="openpyxl")
if "row_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "row_id"})
df["row_id"] = df["row_id"].astype(int)

if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    df[LABEL_LOG] = np.where(mask_valid, np.log10(df[LABEL_RAW]), np.nan)

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG]].head())


GRAPH_EMB_PATH: /root/Invertebrates_EC50_multi_fusion/graph/graph_outputs/reg_graph_embeddings.npy
PHY_EMB_PATH  : /root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2/emb_physchem_mlp_all.npy
DATA_PATH     : /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
OUT_GP        : /root/Invertebrates_EC50_multi_fusion/mid(G+P)/graph_physchem_meta
df 形状: (3620, 36)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849


In [3]:
# ===== Cell 3: 加载 Graph & PhysChem，对齐 + 构造 meta =====

graph_emb   = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)
print("graph_emb 形状:", graph_emb.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

phys_emb   = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)
print("phys_emb 形状:", phys_emb.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

df_indexed = df.set_index("row_id")

ids_graph = set(rowid_graph.tolist())
ids_phys  = set(rowid_phys.tolist())
ids_all   = set(df_indexed.index.tolist())

ids_inter = sorted(list(ids_graph & ids_phys & ids_all))
print("Graph+Phys row_id 交集初始样本数:", len(ids_inter))

idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph)}
idx_map_phys  = {rid: i for i, rid in enumerate(rowid_phys)}

meta_list    = []
X_graph_list = []
X_phys_list  = []
y_list       = []
rid_list     = []

for rid in ids_inter:
    row_meta = df_indexed.loc[rid]
    label = row_meta[LABEL_COL]
    if pd.isna(label) or not np.isfinite(label):
        continue
    if pd.isna(row_meta.get(DURATION_COL)) or pd.isna(row_meta.get(EFFECT_COL)) or pd.isna(row_meta.get(ENDPOINT_COL)):
        continue

    meta_list.append(row_meta)
    X_graph_list.append(graph_emb[idx_map_graph[rid]])
    X_phys_list.append(phys_emb[idx_map_phys[rid]])
    y_list.append(label)
    rid_list.append(rid)

meta_gp  = pd.DataFrame(meta_list).reset_index(drop=True)
X_graph  = np.stack(X_graph_list, axis=0)
X_phys   = np.stack(X_phys_list, axis=0)
y_all    = np.array(y_list, dtype=float)
rowid_all= np.array(rid_list, dtype=int)

print("过滤后样本数:", len(y_all))
print("X_graph 形状:", X_graph.shape)
print("X_phys  形状:", X_phys.shape)

# Duration
meta_gp[DURATION_COL] = pd.to_numeric(meta_gp[DURATION_COL], errors="coerce")
dur_median = meta_gp[DURATION_COL].median()
meta_gp[DURATION_COL] = meta_gp[DURATION_COL].fillna(dur_median)
dur_raw = meta_gp[[DURATION_COL]].values.astype(float)

# One-hot: Effect + Endpoint
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies = pd.get_dummies(meta_gp[cat_cols], dummy_na=False)
cat_all = cat_dummies.values.astype(float)
cat_feature_names = list(cat_dummies.columns)

print("dur_raw 形状:", dur_raw.shape)
print("cat_all 形状:", cat_all.shape)

groups_all = meta_gp[SMILES_COL].astype(str).values
print("总样本数 N:", len(y_all))


graph_emb 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
phys_emb 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
Graph+Phys row_id 交集初始样本数: 3103
过滤后样本数: 3103
X_graph 形状: (3103, 256)
X_phys  形状: (3103, 64)
dur_raw 形状: (3103, 1)
cat_all 形状: (3103, 3)
总样本数 N: 3103


In [4]:
# ===== Cell 4: 原始数值块 =====
X_graph_raw = X_graph
X_phys_raw  = X_phys

print("X_graph_raw 形状:", X_graph_raw.shape)
print("X_phys_raw  形状:", X_phys_raw.shape)
print("dur_raw     形状:", dur_raw.shape)


X_graph_raw 形状: (3103, 256)
X_phys_raw  形状: (3103, 64)
dur_raw     形状: (3103, 1)


In [5]:
# ===== Cell 5: Dataset & Cross-Attn 模型（Graph+Phys） =====

class PairDataset(Dataset):
    def __init__(self, X1, X2, y):
        self.X1 = torch.from_numpy(X1).float()
        self.X2 = torch.from_numpy(X2).float()
        self.y  = torch.from_numpy(y).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]


class CrossAttnEncoder(nn.Module):
    def __init__(self, dim_a, dim_b, hidden_dim=256, num_heads=4, dropout=0.1):
        super().__init__()
        self.proj_a = nn.Linear(dim_a, hidden_dim)
        self.proj_b = nn.Linear(dim_b, hidden_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=False,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, xa, xb):
        h_a = self.proj_a(xa)
        h_b = self.proj_b(xb)
        tokens = torch.stack([h_a, h_b], dim=0)
        attn_out, _ = self.attn(tokens, tokens, tokens)
        fused = attn_out.mean(dim=0)
        return self.dropout(fused)


class CrossAttnWithHead(nn.Module):
    def __init__(self, dim_a, dim_b,
                 hidden_dim=256,
                 num_heads=4,
                 mlp_hidden=512,
                 dropout=0.1):
        super().__init__()
        self.encoder = CrossAttnEncoder(dim_a, dim_b, hidden_dim, num_heads, dropout)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, mlp_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1),
        )

    def forward(self, xa, xb):
        fused = self.encoder(xa, xb)
        out   = self.head(fused).squeeze(-1)
        return out, fused


In [6]:
# ===== Cell 6: Cross-Attn 按 SMILES 8:2 划分 & 训练（Graph+Phys） =====

gss_ca = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=4025)
N = len(y_all)
idx_all = np.arange(N)

ca_train_idx, ca_val_idx = next(
    gss_ca.split(np.zeros(N), y_all, groups_all)
)
ca_train_idx = np.array(ca_train_idx, dtype=np.int64)
ca_val_idx   = np.array(ca_val_idx, dtype=np.int64)

print("Cross-Attn G+P train 样本数:", len(ca_train_idx))
print("Cross-Attn G+P val   样本数:", len(ca_val_idx))

np.save(OUT_GP / "ca_train_idx_G_P.npy", ca_train_idx)
np.save(OUT_GP / "ca_val_idx_G_P.npy",   ca_val_idx)

# 标准化（仅用 Cross-Attn train80% 拟合）
scaler_graph_ca = StandardScaler().fit(X_graph_raw[ca_train_idx])
scaler_phys_ca  = StandardScaler().fit(X_phys_raw[ca_train_idx])

X_graph_ca_all_std = scaler_graph_ca.transform(X_graph_raw)
X_phys_ca_all_std  = scaler_phys_ca.transform(X_phys_raw)

X_graph_ca_train = X_graph_ca_all_std[ca_train_idx]
X_phys_ca_train  = X_phys_ca_all_std[ca_train_idx]
y_ca_train       = y_all[ca_train_idx]

X_graph_ca_val = X_graph_ca_all_std[ca_val_idx]
X_phys_ca_val  = X_phys_ca_all_std[ca_val_idx]
y_ca_val       = y_all[ca_val_idx]

batch_size = 64
ds_ca_tr  = PairDataset(X_graph_ca_train, X_phys_ca_train, y_ca_train)
ds_ca_val = PairDataset(X_graph_ca_val,   X_phys_ca_val,   y_ca_val)

dl_ca_tr  = DataLoader(ds_ca_tr,  batch_size=batch_size, shuffle=True,  drop_last=False)
dl_ca_val = DataLoader(ds_ca_val, batch_size=batch_size, shuffle=False, drop_last=False)

dim_a = X_graph_ca_train.shape[1]
dim_b = X_phys_ca_train.shape[1]

model_ca = CrossAttnWithHead(
    dim_a=dim_a,
    dim_b=dim_b,
    hidden_dim=256,
    num_heads=4,
    mlp_hidden=512,
    dropout=0.1,
).to(device)

loss_fn = nn.L1Loss()
optimizer = torch.optim.AdamW(
    model_ca.parameters(),
    lr=5e-4,
    weight_decay=1e-4,
)

max_epochs = 80
patience   = 10
best_val_mae = float("inf")
best_state_dict = None
best_epoch = -1
epochs_no_improve = 0

history_ca = {"train_mae": [], "val_mae": []}

for epoch in range(1, max_epochs + 1):
    model_ca.train()
    train_abs_err = []

    for X1_b, X2_b, y_b in dl_ca_tr:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        y_b  = y_b.to(device)

        optimizer.zero_grad()
        y_hat, fused = model_ca(X1_b, X2_b)
        loss = loss_fn(y_hat, y_b)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_ca.parameters(), max_norm=1.0)
        optimizer.step()

        train_abs_err.append(torch.abs(y_hat.detach() - y_b).cpu().numpy())

    train_mae = float(np.mean(np.concatenate(train_abs_err)))
    history_ca["train_mae"].append(train_mae)

    model_ca.eval()
    val_abs_err = []
    with torch.no_grad():
        for X1_b, X2_b, y_b in dl_ca_val:
            X1_b = X1_b.to(device)
            X2_b = X2_b.to(device)
            y_b  = y_b.to(device)

            y_hat, fused = model_ca(X1_b, X2_b)
            val_abs_err.append(torch.abs(y_hat - y_b).cpu().numpy())

    val_mae = float(np.mean(np.concatenate(val_abs_err)))
    history_ca["val_mae"].append(val_mae)

    print(f"[Cross-Attn G+P] Epoch {epoch:03d} | train MAE = {train_mae:.4f}, val MAE = {val_mae:.4f}")

    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        best_state_dict = {k: v.cpu().clone() for k, v in model_ca.state_dict().items()}
        best_epoch = epoch
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"[Cross-Attn G+P] Early stopping at epoch {epoch}, best_epoch = {best_epoch}")
            break

if best_state_dict is not None:
    model_ca.load_state_dict(best_state_dict)
model_ca.to(device)
model_ca.eval()

encoder_gp = model_ca.encoder
encoder_gp.eval().to(device)

ds_all_ca = PairDataset(X_graph_ca_all_std, X_phys_ca_all_std, y_all)
dl_all_ca = DataLoader(ds_all_ca, batch_size=batch_size, shuffle=False)

fused_all_list = []
with torch.no_grad():
    for X1_b, X2_b, y_b in dl_all_ca:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        fused = encoder_gp(X1_b, X2_b)
        fused_all_list.append(fused.cpu().numpy())

fused_all_GP = np.concatenate(fused_all_list, axis=0)
print("fused_all_G_P 形状:", fused_all_GP.shape)

torch.save(
    {
        "state_dict": best_state_dict,
        "config": {
            "dim_a": dim_a,
            "dim_b": dim_b,
            "hidden_dim": 256,
            "num_heads": 4,
            "mlp_hidden": 512,
            "dropout": 0.1,
        },
    },
    MODELS_CA / "crossattn_G_P_best.pt"
)

np.save(OUT_GP / "fused_all_G_P.npy", fused_all_GP)
np.save(OUT_GP / "row_id_all_G_P.npy", rowid_all)
np.save(OUT_GP / "y_all_G_P.npy",      y_all)
np.save(OUT_GP / "groups_all_G_P.npy", groups_all)

with open(OUT_GP / "crossattn_G_P_history.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_epoch": best_epoch,
            "best_val_mae": float(best_val_mae),
            "history": history_ca,
            "n_all": int(len(y_all)),
            "n_train": int(len(ca_train_idx)),
            "n_val": int(len(ca_val_idx)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Cross-Attn (Graph+Phys) 训练完成，fused_all_G_P 已保存。")


Cross-Attn G+P train 样本数: 2510
Cross-Attn G+P val   样本数: 593
[Cross-Attn G+P] Epoch 001 | train MAE = 0.7107, val MAE = 0.5925
[Cross-Attn G+P] Epoch 002 | train MAE = 0.5778, val MAE = 0.5691
[Cross-Attn G+P] Epoch 003 | train MAE = 0.5620, val MAE = 0.5692
[Cross-Attn G+P] Epoch 004 | train MAE = 0.5517, val MAE = 0.5788
[Cross-Attn G+P] Epoch 005 | train MAE = 0.5454, val MAE = 0.5649
[Cross-Attn G+P] Epoch 006 | train MAE = 0.5452, val MAE = 0.5915
[Cross-Attn G+P] Epoch 007 | train MAE = 0.5342, val MAE = 0.5820
[Cross-Attn G+P] Epoch 008 | train MAE = 0.5224, val MAE = 0.5734
[Cross-Attn G+P] Epoch 009 | train MAE = 0.5155, val MAE = 0.5752
[Cross-Attn G+P] Epoch 010 | train MAE = 0.5152, val MAE = 0.5699
[Cross-Attn G+P] Epoch 011 | train MAE = 0.5068, val MAE = 0.5729
[Cross-Attn G+P] Epoch 012 | train MAE = 0.5091, val MAE = 0.5610
[Cross-Attn G+P] Epoch 013 | train MAE = 0.5010, val MAE = 0.5785
[Cross-Attn G+P] Epoch 014 | train MAE = 0.5013, val MAE = 0.5529
[Cross-Attn G+P

In [7]:
# ===== Cell 7: RF 端 8:2 划分 & 构造 RF 输入特征 (G+P) =====

gss_rf = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=4031)
N = len(y_all)

rf_train_idx, rf_test_idx = next(
    gss_rf.split(np.zeros(N), y_all, groups_all)
)
rf_train_idx = np.array(rf_train_idx, dtype=np.int64)
rf_test_idx  = np.array(rf_test_idx, dtype=np.int64)

print("RF G+P train 样本数:", len(rf_train_idx))
print("RF G+P test  样本数:", len(rf_test_idx))

np.save(OUT_GP / "rf_train_idx_G_P.npy", rf_train_idx)
np.save(OUT_GP / "rf_test_idx_G_P.npy",  rf_test_idx)

scaler_dur_rf = StandardScaler().fit(dur_raw[rf_train_idx])
dur_all_std_rf = scaler_dur_rf.transform(dur_raw)
print("dur_all_std_rf 形状:", dur_all_std_rf.shape)

X_all_RF = np.concatenate(
    [fused_all_GP, dur_all_std_rf, cat_all],
    axis=1
)
print("X_all_RF 形状:", X_all_RF.shape)

X_train_RF = X_all_RF[rf_train_idx]
y_train_RF = y_all[rf_train_idx]
groups_train_RF = groups_all[rf_train_idx]

X_test_RF  = X_all_RF[rf_test_idx]
y_test_RF  = y_all[rf_test_idx]

print("X_train_RF 形状:", X_train_RF.shape)
print("X_test_RF  形状:", X_test_RF.shape)

np.save(OUT_GP / "X_all_RF_G_P.npy", X_all_RF)
np.save(OUT_GP / "X_train_RF_G_P.npy", X_train_RF)
np.save(OUT_GP / "X_test_RF_G_P.npy",  X_test_RF)


RF G+P train 样本数: 2462
RF G+P test  样本数: 641
dur_all_std_rf 形状: (3103, 1)
X_all_RF 形状: (3103, 260)
X_train_RF 形状: (2462, 260)
X_test_RF  形状: (641, 260)


In [8]:
# ===== Cell 8: RF 十折 CV + RandomizedSearchCV (G+P) =====

param_distributions_rf = {
    "n_estimators":      [200, 300, 500, 800, 1000],
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf":  [1, 2, 4],
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

rf_base = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

cv_inner = GroupKFold(n_splits=10)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_inner,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（G+P fused + meta）...")
rf_search.fit(X_train_RF, y_train_RF, groups=groups_train_RF)

best_params_rf = rf_search.best_params_
best_score_rf  = rf_search.best_score_

print("\n[G+P+meta→RF] 最优超参：")
print(best_params_rf)
print("最优 CV 分数 (neg MAE):", best_score_rf)

rf_final = RandomForestRegressor(
    **best_params_rf,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final.fit(X_train_RF, y_train_RF)

y_train_pred_RF = rf_final.predict(X_train_RF)
y_test_pred_RF  = rf_final.predict(X_test_RF)

metrics_train_RF = compute_regression_metrics(y_train_RF, y_train_pred_RF)
metrics_test_RF  = compute_regression_metrics(y_test_RF,  y_test_pred_RF)

print("\n===== [G+P fused + meta → RF] RF train80% 指标 =====")
for k, v in metrics_train_RF.items():
    print(f"{k}: {v:.4f}")

print("\n===== [G+P fused + meta → RF] RF test20% 指标 =====")
for k, v in metrics_test_RF.items():
    print(f"{k}: {v:.4f}")

joblib.dump(
    {
        "model": rf_final,
        "scaler_graph_ca": scaler_graph_ca,
        "scaler_phys_ca": scaler_phys_ca,
        "encoder_state_dict": best_state_dict,
        "scaler_dur_rf": scaler_dur_rf,
        "cat_feature_names": cat_feature_names,
        "config": {
            "hidden_dim": 256,
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions_rf": param_distributions_rf,
        },
    },
    MODELS_RF / "rf_G_P_meta_from_CA.joblib"
)

np.save(OUT_GP / "y_train_RF_G_P.npy", y_train_RF)
np.save(OUT_GP / "y_test_RF_G_P.npy",  y_test_RF)
np.save(OUT_GP / "y_train_pred_RF_G_P.npy", y_train_pred_RF)
np.save(OUT_GP / "y_test_pred_RF_G_P.npy",  y_test_pred_RF)

with open(OUT_GP / "metrics_RF_G_P_meta_from_CA.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_rf": best_params_rf,
            "best_score_cv_neg_mae": float(best_score_rf),
            "train80_metrics": metrics_train_RF,
            "test20_metrics": metrics_test_RF,
            "n_all": int(len(y_all)),
            "n_train80_rf": int(len(y_train_RF)),
            "n_test20_rf": int(len(y_test_RF)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ RF (G+P fused + meta) 训练 & 评估完成。")


开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（G+P fused + meta）...
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=40, max_features=0.8, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=20.4min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time= 3.2min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time= 3.1min
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time= 9.1min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 1.1min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  57.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 1.2min
[CV] END max_depth=40, max_features