In [1]:
# ===== Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import joblib

# ---- 全局随机种子 ----
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用设备:", device)


def compute_regression_metrics(y_true, y_pred):
    """计算 MAE / RMSE / R2 / Pearson_r"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2   = float(r2_score(y_true, y_pred))

    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        pr = float("nan")
    else:
        pr, _ = pearsonr(y_true, y_pred)
        pr = float(pr)

    return {"MAE": mae, "RMSE": rmse, "R2": r2, "Pearson_r": pr}


def np_encoder(o):
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


使用设备: cuda


In [2]:
# ===== Cell 2: 路径 & 读取 df =====
ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Text CLS 嵌入
TEXT_DIR     = ROOT_MULTI / "SMILES" / "smiles_outputs"
TEXT_EMB_768 = TEXT_DIR / "reg_smiles_cls_embeddings_all.npy"

# PhysChem MLP 嵌入 + row_id
PHY_DIR        = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"
PHY_EMB_PATH   = PHY_DIR / "emb_physchem_mlp_all.npy"
PHY_ROWID_PATH = PHY_DIR / "row_id_clean.npy"

# 原始数据
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

# 输出目录
MID_ROOT   = ROOT_MULTI / "mid(T+P)"
OUT_TP     = MID_ROOT / "text_physchem_meta"
MODELS_CA  = OUT_TP / "models_crossattn"
MODELS_RF  = OUT_TP / "models_rf"
for d in [MID_ROOT, OUT_TP, MODELS_CA, MODELS_RF]:
    d.mkdir(parents=True, exist_ok=True)

# 列名
SMILES_COL   = "SMILES_Canonical_RDKit"
DURATION_COL = "Duration_Value(hour)"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
LABEL_RAW    = "mgperL"
LABEL_LOG    = "mgperL_log"
LABEL_COL    = LABEL_LOG

print("TEXT_EMB_768:", TEXT_EMB_768)
print("PHY_EMB_PATH:", PHY_EMB_PATH)
print("DATA_PATH   :", DATA_PATH)
print("OUT_TP      :", OUT_TP)

# 读取 df
df = pd.read_excel(DATA_PATH, engine="openpyxl")
if "row_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "row_id"})
df["row_id"] = df["row_id"].astype(int)

# 构造 mgperL_log（如需）
if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    df[LABEL_LOG] = np.where(mask_valid, np.log10(df[LABEL_RAW]), np.nan)

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG]].head())


TEXT_EMB_768: /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs/reg_smiles_cls_embeddings_all.npy
PHY_EMB_PATH: /root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2/emb_physchem_mlp_all.npy
DATA_PATH   : /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
OUT_TP      : /root/Invertebrates_EC50_multi_fusion/mid(T+P)/text_physchem_meta
df 形状: (3620, 36)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849


In [3]:
# ===== Cell 3: 加载 Text & PhysChem，对齐 + 构造 meta =====

# Text：CLS 按 df 行号对齐
text_all_full = np.load(TEXT_EMB_768)
assert text_all_full.shape[0] == len(df), "text_all_full 行数应与 df 行数一致"
print("text_all_full 形状:", text_all_full.shape)

# PhysChem
phys_emb   = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)
print("phys_emb 形状:", phys_emb.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

df_indexed = df.set_index("row_id")

ids_text = set(df_indexed.index.tolist())   # text 按 df 行号
ids_phys = set(rowid_phys.tolist())
ids_all  = set(df_indexed.index.tolist())

ids_inter = sorted(list(ids_text & ids_phys & ids_all))
print("Text+Phys row_id 交集初始样本数:", len(ids_inter))

idx_map_phys = {rid: i for i, rid in enumerate(rowid_phys)}

meta_list    = []
X_text_list  = []
X_phys_list  = []
y_list       = []
rid_list     = []

for rid in ids_inter:
    row_meta = df_indexed.loc[rid]
    label = row_meta[LABEL_COL]
    # 要求标签 & meta 都存在
    if pd.isna(label) or not np.isfinite(label):
        continue
    if pd.isna(row_meta.get(DURATION_COL)) or pd.isna(row_meta.get(EFFECT_COL)) or pd.isna(row_meta.get(ENDPOINT_COL)):
        continue

    meta_list.append(row_meta)
    X_text_list.append(text_all_full[rid])                      # Text 直接用 row_id 索引
    X_phys_list.append(phys_emb[idx_map_phys[rid]])             # PhysChem 用 rowid_phys 映射
    y_list.append(label)
    rid_list.append(rid)

meta_tp  = pd.DataFrame(meta_list).reset_index(drop=True)
X_text   = np.stack(X_text_list, axis=0)
X_phys   = np.stack(X_phys_list, axis=0)
y_all    = np.array(y_list, dtype=float)
rowid_all= np.array(rid_list, dtype=int)

print("过滤后样本数:", len(y_all))
print("X_text  形状:", X_text.shape)
print("X_phys  形状:", X_phys.shape)

# ========== 构造 meta：Duration + Effect/Endpoint one-hot ==========
# Duration
meta_tp[DURATION_COL] = pd.to_numeric(meta_tp[DURATION_COL], errors="coerce")
dur_median = meta_tp[DURATION_COL].median()
meta_tp[DURATION_COL] = meta_tp[DURATION_COL].fillna(dur_median)
dur_raw = meta_tp[[DURATION_COL]].values.astype(float)  # (N, 1)

# One-hot: Effect + Endpoint
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies = pd.get_dummies(meta_tp[cat_cols], dummy_na=False)
cat_all = cat_dummies.values.astype(float)
cat_feature_names = list(cat_dummies.columns)

print("dur_raw 形状:", dur_raw.shape)
print("cat_all 形状:", cat_all.shape)

# 分组（按 SMILES）
groups_all = meta_tp[SMILES_COL].astype(str).values
print("总样本数 N:", len(y_all))


text_all_full 形状: (3620, 768)
phys_emb 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
Text+Phys row_id 交集初始样本数: 3406
过滤后样本数: 3406
X_text  形状: (3406, 768)
X_phys  形状: (3406, 64)
dur_raw 形状: (3406, 1)
cat_all 形状: (3406, 3)
总样本数 N: 3406


In [4]:
# ===== Cell 4: 原始数值块 =====
X_text_raw = X_text      # (N, d_text)
X_phys_raw = X_phys      # (N, d_phys)
print("X_text_raw 形状:", X_text_raw.shape)
print("X_phys_raw 形状:", X_phys_raw.shape)
print("dur_raw    形状:", dur_raw.shape)


X_text_raw 形状: (3406, 768)
X_phys_raw 形状: (3406, 64)
dur_raw    形状: (3406, 1)


In [5]:
# ===== Cell 5: Dataset & Cross-Attn 模型（Text+Phys） =====

class PairDataset(Dataset):
    def __init__(self, X1, X2, y):
        self.X1 = torch.from_numpy(X1).float()
        self.X2 = torch.from_numpy(X2).float()
        self.y  = torch.from_numpy(y).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]


class CrossAttnEncoder(nn.Module):
    """两模态编码，输出融合 embedding"""
    def __init__(self, dim_a, dim_b, hidden_dim=256, num_heads=4, dropout=0.1):
        super().__init__()
        self.proj_a = nn.Linear(dim_a, hidden_dim)
        self.proj_b = nn.Linear(dim_b, hidden_dim)

        self.attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=False,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, xa, xb):
        # xa, xb: (B, d_a/d_b)
        h_a = self.proj_a(xa)  # (B, hidden)
        h_b = self.proj_b(xb)  # (B, hidden)
        tokens = torch.stack([h_a, h_b], dim=0)  # (2, B, hidden)
        attn_out, _ = self.attn(tokens, tokens, tokens)  # (2, B, hidden)
        fused = attn_out.mean(dim=0)  # (B, hidden)
        return self.dropout(fused)


class CrossAttnWithHead(nn.Module):
    """Encoder + 小回归头，用于训练 encoder"""
    def __init__(self, dim_a, dim_b,
                 hidden_dim=256,
                 num_heads=4,
                 mlp_hidden=512,
                 dropout=0.1):
        super().__init__()
        self.encoder = CrossAttnEncoder(dim_a, dim_b, hidden_dim, num_heads, dropout)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, mlp_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1),
        )

    def forward(self, xa, xb):
        fused = self.encoder(xa, xb)          # (B, hidden)
        out   = self.head(fused).squeeze(-1)  # (B,)
        return out, fused


In [6]:
# ===== Cell 6: Cross-Attn 按 SMILES 8:2 划分 & 训练（Text+Phys） =====

# 1) Cross-Attn 自己的 8:2 划分（按 SMILES 分组）
gss_ca = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=3025)
N = len(y_all)
idx_all = np.arange(N)

ca_train_idx, ca_val_idx = next(
    gss_ca.split(np.zeros(N), y_all, groups_all)
)
ca_train_idx = np.array(ca_train_idx, dtype=np.int64)
ca_val_idx   = np.array(ca_val_idx, dtype=np.int64)

print("Cross-Attn T+P train 样本数:", len(ca_train_idx))
print("Cross-Attn T+P val   样本数:", len(ca_val_idx))

np.save(OUT_TP / "ca_train_idx_T_P.npy", ca_train_idx)
np.save(OUT_TP / "ca_val_idx_T_P.npy",   ca_val_idx)

# 2) 在 Cross-Attn train80% 上拟合 Text / Phys 的 scaler
scaler_text_ca = StandardScaler().fit(X_text_raw[ca_train_idx])
scaler_phys_ca = StandardScaler().fit(X_phys_raw[ca_train_idx])

X_text_ca_all_std = scaler_text_ca.transform(X_text_raw)
X_phys_ca_all_std = scaler_phys_ca.transform(X_phys_raw)

X_text_ca_train = X_text_ca_all_std[ca_train_idx]
X_phys_ca_train = X_phys_ca_all_std[ca_train_idx]
y_ca_train      = y_all[ca_train_idx]

X_text_ca_val = X_text_ca_all_std[ca_val_idx]
X_phys_ca_val = X_phys_ca_all_std[ca_val_idx]
y_ca_val      = y_all[ca_val_idx]

print("X_text_ca_train 形状:", X_text_ca_train.shape)
print("X_phys_ca_train 形状:", X_phys_ca_train.shape)

# 3) DataLoader
batch_size = 64
ds_ca_tr  = PairDataset(X_text_ca_train, X_phys_ca_train, y_ca_train)
ds_ca_val = PairDataset(X_text_ca_val,   X_phys_ca_val,   y_ca_val)

dl_ca_tr  = DataLoader(ds_ca_tr,  batch_size=batch_size, shuffle=True,  drop_last=False)
dl_ca_val = DataLoader(ds_ca_val, batch_size=batch_size, shuffle=False, drop_last=False)

# 4) 定义 Cross-Attn 模型
dim_a = X_text_ca_train.shape[1]
dim_b = X_phys_ca_train.shape[1]

model_ca = CrossAttnWithHead(
    dim_a=dim_a,
    dim_b=dim_b,
    hidden_dim=256,
    num_heads=4,
    mlp_hidden=512,
    dropout=0.1,
).to(device)

loss_fn = nn.L1Loss()
optimizer = torch.optim.AdamW(
    model_ca.parameters(),
    lr=5e-4,
    weight_decay=1e-4,
)

max_epochs = 80
patience   = 10
best_val_mae = float("inf")
best_state_dict = None
best_epoch = -1
epochs_no_improve = 0

history_ca = {"train_mae": [], "val_mae": []}

# 5) 训练（val 做 early stopping）
for epoch in range(1, max_epochs + 1):
    model_ca.train()
    train_abs_err = []

    for X1_b, X2_b, y_b in dl_ca_tr:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        y_b  = y_b.to(device)

        optimizer.zero_grad()
        y_hat, fused = model_ca(X1_b, X2_b)
        loss = loss_fn(y_hat, y_b)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_ca.parameters(), max_norm=1.0)
        optimizer.step()

        train_abs_err.append(torch.abs(y_hat.detach() - y_b).cpu().numpy())

    train_mae = float(np.mean(np.concatenate(train_abs_err)))
    history_ca["train_mae"].append(train_mae)

    model_ca.eval()
    val_abs_err = []
    with torch.no_grad():
        for X1_b, X2_b, y_b in dl_ca_val:
            X1_b = X1_b.to(device)
            X2_b = X2_b.to(device)
            y_b  = y_b.to(device)

            y_hat, fused = model_ca(X1_b, X2_b)
            val_abs_err.append(torch.abs(y_hat - y_b).cpu().numpy())

    val_mae = float(np.mean(np.concatenate(val_abs_err)))
    history_ca["val_mae"].append(val_mae)

    print(f"[Cross-Attn T+P] Epoch {epoch:03d} | train MAE = {train_mae:.4f}, val MAE = {val_mae:.4f}")

    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        best_state_dict = {k: v.cpu().clone() for k, v in model_ca.state_dict().items()}
        best_epoch = epoch
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"[Cross-Attn T+P] Early stopping at epoch {epoch}, best_epoch = {best_epoch}")
            break

# 6) 加载最佳权重，抽 fused_all_T_P
if best_state_dict is not None:
    model_ca.load_state_dict(best_state_dict)
model_ca.to(device)
model_ca.eval()

encoder_tp = model_ca.encoder
encoder_tp.eval().to(device)

ds_all_ca = PairDataset(X_text_ca_all_std, X_phys_ca_all_std, y_all)
dl_all_ca = DataLoader(ds_all_ca, batch_size=batch_size, shuffle=False)

fused_all_list = []
with torch.no_grad():
    for X1_b, X2_b, y_b in dl_all_ca:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        fused = encoder_tp(X1_b, X2_b)
        fused_all_list.append(fused.cpu().numpy())

fused_all_TP = np.concatenate(fused_all_list, axis=0)  # (N, hidden_dim)
print("fused_all_TP 形状:", fused_all_TP.shape)

# 7) 保存 Cross-Attn 模型 & fusion embeddings
torch.save(
    {
        "state_dict": best_state_dict,
        "config": {
            "dim_a": dim_a,
            "dim_b": dim_b,
            "hidden_dim": 256,
            "num_heads": 4,
            "mlp_hidden": 512,
            "dropout": 0.1,
        },
    },
    MODELS_CA / "crossattn_T_P_best.pt"
)

np.save(OUT_TP / "fused_all_T_P.npy", fused_all_TP)
np.save(OUT_TP / "row_id_all_T_P.npy", rowid_all)
np.save(OUT_TP / "y_all_T_P.npy",      y_all)
np.save(OUT_TP / "groups_all_T_P.npy", groups_all)

with open(OUT_TP / "crossattn_T_P_history.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_epoch": best_epoch,
            "best_val_mae": float(best_val_mae),
            "history": history_ca,
            "n_all": int(len(y_all)),
            "n_train": int(len(ca_train_idx)),
            "n_val": int(len(ca_val_idx)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Cross-Attn (Text+Phys) 训练完成，fused_all_T_P 已保存。")


Cross-Attn T+P train 样本数: 2733
Cross-Attn T+P val   样本数: 673
X_text_ca_train 形状: (2733, 768)
X_phys_ca_train 形状: (2733, 64)
[Cross-Attn T+P] Epoch 001 | train MAE = 0.6539, val MAE = 0.5834
[Cross-Attn T+P] Epoch 002 | train MAE = 0.5431, val MAE = 0.5475
[Cross-Attn T+P] Epoch 003 | train MAE = 0.5246, val MAE = 0.5570
[Cross-Attn T+P] Epoch 004 | train MAE = 0.5075, val MAE = 0.5505
[Cross-Attn T+P] Epoch 005 | train MAE = 0.5089, val MAE = 0.5365
[Cross-Attn T+P] Epoch 006 | train MAE = 0.4982, val MAE = 0.5476
[Cross-Attn T+P] Epoch 007 | train MAE = 0.5011, val MAE = 0.5606
[Cross-Attn T+P] Epoch 008 | train MAE = 0.4968, val MAE = 0.5277
[Cross-Attn T+P] Epoch 009 | train MAE = 0.4822, val MAE = 0.5305
[Cross-Attn T+P] Epoch 010 | train MAE = 0.4894, val MAE = 0.5272
[Cross-Attn T+P] Epoch 011 | train MAE = 0.4773, val MAE = 0.5417
[Cross-Attn T+P] Epoch 012 | train MAE = 0.4751, val MAE = 0.5585
[Cross-Attn T+P] Epoch 013 | train MAE = 0.4662, val MAE = 0.5431
[Cross-Attn T+P] E

In [7]:
# ===== Cell 7: RF 端 8:2 划分 & 构造 RF 输入特征 =====

gss_rf = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=3031)
N = len(y_all)

rf_train_idx, rf_test_idx = next(
    gss_rf.split(np.zeros(N), y_all, groups_all)
)
rf_train_idx = np.array(rf_train_idx, dtype=np.int64)
rf_test_idx  = np.array(rf_test_idx, dtype=np.int64)

print("RF T+P train 样本数:", len(rf_train_idx))
print("RF T+P test  样本数:", len(rf_test_idx))

np.save(OUT_TP / "rf_train_idx_T_P.npy", rf_train_idx)
np.save(OUT_TP / "rf_test_idx_T_P.npy",  rf_test_idx)

# RF 专用的 duration scaler（只在 RF train80% 上 fit）
scaler_dur_rf = StandardScaler().fit(dur_raw[rf_train_idx])
dur_all_std_rf = scaler_dur_rf.transform(dur_raw)
print("dur_all_std_rf 形状:", dur_all_std_rf.shape)

# RF 输入特征：fusion embedding + duration_std + one-hot
X_all_RF = np.concatenate(
    [fused_all_TP, dur_all_std_rf, cat_all],
    axis=1
)
print("X_all_RF 形状:", X_all_RF.shape)

X_train_RF = X_all_RF[rf_train_idx]
y_train_RF = y_all[rf_train_idx]
groups_train_RF = groups_all[rf_train_idx]

X_test_RF  = X_all_RF[rf_test_idx]
y_test_RF  = y_all[rf_test_idx]

print("X_train_RF 形状:", X_train_RF.shape)
print("X_test_RF  形状:", X_test_RF.shape)

np.save(OUT_TP / "X_all_RF_T_P.npy", X_all_RF)
np.save(OUT_TP / "X_train_RF_T_P.npy", X_train_RF)
np.save(OUT_TP / "X_test_RF_T_P.npy",  X_test_RF)


RF T+P train 样本数: 2726
RF T+P test  样本数: 680
dur_all_std_rf 形状: (3406, 1)
X_all_RF 形状: (3406, 260)
X_train_RF 形状: (2726, 260)
X_test_RF  形状: (680, 260)


In [8]:
# ===== Cell 8: RF 十折 CV + RandomizedSearchCV (T+P) =====

param_distributions_rf = {
    "n_estimators":      [200, 300, 500, 800, 1000],
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf":  [1, 2, 4],
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

rf_base = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

cv_inner = GroupKFold(n_splits=10)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_inner,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（T+P fused + meta）...")
rf_search.fit(X_train_RF, y_train_RF, groups=groups_train_RF)

best_params_rf = rf_search.best_params_
best_score_rf  = rf_search.best_score_

print("\n[T+P+meta→RF] 最优超参：")
print(best_params_rf)
print("最优 CV 分数 (neg MAE):", best_score_rf)

# 用最优超参在 RF train80% 上重训
rf_final = RandomForestRegressor(
    **best_params_rf,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final.fit(X_train_RF, y_train_RF)

# 评估 train80% & test20%
y_train_pred_RF = rf_final.predict(X_train_RF)
y_test_pred_RF  = rf_final.predict(X_test_RF)

metrics_train_RF = compute_regression_metrics(y_train_RF, y_train_pred_RF)
metrics_test_RF  = compute_regression_metrics(y_test_RF,  y_test_pred_RF)

print("\n===== [T+P fused + meta → RF] RF train80% 指标 =====")
for k, v in metrics_train_RF.items():
    print(f"{k}: {v:.4f}")

print("\n===== [T+P fused + meta → RF] RF test20% 指标 =====")
for k, v in metrics_test_RF.items():
    print(f"{k}: {v:.4f}")

# 保存 RF 模型 & 结果
joblib.dump(
    {
        "model": rf_final,
        "scaler_text_ca": scaler_text_ca,
        "scaler_phys_ca": scaler_phys_ca,
        "encoder_state_dict": best_state_dict,
        "scaler_dur_rf": scaler_dur_rf,
        "cat_feature_names": cat_feature_names,
        "config": {
            "hidden_dim": 256,
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions_rf": param_distributions_rf,
        },
    },
    MODELS_RF / "rf_T_P_meta_from_CA.joblib"
)

np.save(OUT_TP / "y_train_RF_T_P.npy", y_train_RF)
np.save(OUT_TP / "y_test_RF_T_P.npy",  y_test_RF)
np.save(OUT_TP / "y_train_pred_RF_T_P.npy", y_train_pred_RF)
np.save(OUT_TP / "y_test_pred_RF_T_P.npy",  y_test_pred_RF)

with open(OUT_TP / "metrics_RF_T_P_meta_from_CA.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_rf": best_params_rf,
            "best_score_cv_neg_mae": float(best_score_rf),
            "train80_metrics": metrics_train_RF,
            "test20_metrics": metrics_test_RF,
            "n_all": int(len(y_all)),
            "n_train80_rf": int(len(y_train_RF)),
            "n_test20_rf": int(len(y_test_RF)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ RF (T+P fused + meta) 训练 & 评估完成。")


开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（T+P fused + meta）...
Fitting 10 folds for each of 30 candidates, totalling 300 fits

[T+P+meta→RF] 最优超参：
{'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.8, 'max_depth': 40}
最优 CV 分数 (neg MAE): -0.42783054017874633

===== [T+P fused + meta → RF] RF train80% 指标 =====
MAE: 0.1879
RMSE: 0.2923
R2: 0.9408
Pearson_r: 0.9714

===== [T+P fused + meta → RF] RF test20% 指标 =====
MAE: 0.4099
RMSE: 0.5885
R2: 0.7439
Pearson_r: 0.8626

✅ RF (T+P fused + meta) 训练 & 评估完成。
