In [1]:
# ===== Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import joblib

# ---- 全局随机种子 ----
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用设备:", device)


def compute_regression_metrics(y_true, y_pred):
    """计算 MAE / RMSE / R2 / Pearson_r"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2   = float(r2_score(y_true, y_pred))

    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        pr = float("nan")
    else:
        pr, _ = pearsonr(y_true, y_pred)
        pr = float(pr)

    return {"MAE": mae, "RMSE": rmse, "R2": r2, "Pearson_r": pr}


def np_encoder(o):
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


使用设备: cuda


In [2]:
# ===== Cell 2: 路径 & 读 df =====
ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Text CLS 全量嵌入（行号对齐 df）
TEXT_DIR     = ROOT_MULTI / "SMILES" / "smiles_outputs"
TEXT_EMB_768 = TEXT_DIR / "reg_smiles_cls_embeddings_all.npy"

# Graph embedding + row_id
GRAPH_DIR        = ROOT_MULTI / "graph" / "graph_outputs"
GRAPH_EMB_PATH   = GRAPH_DIR / "reg_graph_embeddings.npy"
GRAPH_ROWID_PATH = GRAPH_DIR / "row_id_graph_for_emb.npy"

# 原始数据（含 SMILES / mgperL / Duration / Effect / Endpoint）
DATA_PATH = Path("/root/fusion_dataset/Invertebrates_EC50_unique.xlsx")

# 输出目录
MID_ROOT    = ROOT_MULTI / "mid(T+G)"
OUT_TG      = MID_ROOT / "text_graph_meta"
MODELS_CA   = OUT_TG / "models_crossattn"
MODELS_RF   = OUT_TG / "models_rf"
for d in [MID_ROOT, OUT_TG, MODELS_CA, MODELS_RF]:
    d.mkdir(parents=True, exist_ok=True)

# 列名
SMILES_COL   = "SMILES_Canonical_RDKit"
DURATION_COL = "Duration_Value(hour)"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
LABEL_RAW    = "mgperL"
LABEL_LOG    = "mgperL_log"   # 若不存在就自己建
LABEL_COL    = LABEL_LOG

print("TEXT_EMB_768 :", TEXT_EMB_768)
print("GRAPH_EMB_PATH:", GRAPH_EMB_PATH)
print("DATA_PATH    :", DATA_PATH)
print("OUT_TG       :", OUT_TG)

# ===== 读取 df & 构造标签 =====
df = pd.read_excel(DATA_PATH, engine="openpyxl")

if "row_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "row_id"})
df["row_id"] = df["row_id"].astype(int)

if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    df[LABEL_LOG] = np.where(mask_valid, np.log10(df[LABEL_RAW]), np.nan)

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG]].head())


TEXT_EMB_768 : /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs/reg_smiles_cls_embeddings_all.npy
GRAPH_EMB_PATH: /root/Invertebrates_EC50_multi_fusion/graph/graph_outputs/reg_graph_embeddings.npy
DATA_PATH    : /root/fusion_dataset/Invertebrates_EC50_unique.xlsx
OUT_TG       : /root/Invertebrates_EC50_multi_fusion/mid(T+G)/text_graph_meta
df 形状: (3620, 12)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849


In [3]:
# ===== Cell 3: 加载 Text & Graph + 对齐 + 构造 meta =====

# Text 全量 CLS
text_all_full = np.load(TEXT_EMB_768)
assert text_all_full.shape[0] == len(df), "text_all_full 行数应与 df 一致"
print("text_all_full 形状:", text_all_full.shape)

# Graph
graph_emb   = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)
print("graph_emb 形状:", graph_emb.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

df_indexed = df.set_index("row_id")

ids_text  = set(df_indexed.index.tolist())          # text 是按 df 行号
ids_graph = set(rowid_graph.tolist())
ids_all   = set(df_indexed.index.tolist())

ids_inter = sorted(list(ids_text & ids_graph & ids_all))
print("Text+Graph row_id 交集初始样本数:", len(ids_inter))

idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph)}

meta_list   = []
X_text_list = []
X_graph_list= []
y_list      = []
rid_list    = []

for rid in ids_inter:
    row_meta = df_indexed.loc[rid]
    label = row_meta[LABEL_COL]
    # 要求：label 不缺失，duration/effect/endpoint 都存在
    if pd.isna(label) or not np.isfinite(label):
        continue
    if pd.isna(row_meta.get(DURATION_COL)) or pd.isna(row_meta.get(EFFECT_COL)) or pd.isna(row_meta.get(ENDPOINT_COL)):
        continue

    meta_list.append(row_meta)
    X_text_list.append(text_all_full[rid])            # Text CLS：按 row_id 取
    X_graph_list.append(graph_emb[idx_map_graph[rid]])
    y_list.append(label)
    rid_list.append(rid)

meta_tg  = pd.DataFrame(meta_list).reset_index(drop=True)
X_text   = np.stack(X_text_list, axis=0)
X_graph  = np.stack(X_graph_list, axis=0)
y_all    = np.array(y_list, dtype=float)
rowid_all= np.array(rid_list, dtype=int)

print("过滤后样本数:", len(y_all))
print("X_text  形状:", X_text.shape)
print("X_graph 形状:", X_graph.shape)

# ========== 构造 meta：Duration + Effect/Endpoint one-hot ==========
# Duration
meta_tg[DURATION_COL] = pd.to_numeric(meta_tg[DURATION_COL], errors="coerce")
dur_median = meta_tg[DURATION_COL].median()
meta_tg[DURATION_COL] = meta_tg[DURATION_COL].fillna(dur_median)
dur_all_raw = meta_tg[[DURATION_COL]].values.astype(float)

# One-hot: Effect + Endpoint
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies = pd.get_dummies(meta_tg[cat_cols], dummy_na=False)
cat_all = cat_dummies.values.astype(float)
cat_feature_names = list(cat_dummies.columns)

print("dur_all_raw 形状:", dur_all_raw.shape)
print("cat_all     形状:", cat_all.shape)

# 分组（按 SMILES）
groups_all = meta_tg[SMILES_COL].astype(str).values
print("总样本数 N:", len(y_all))


text_all_full 形状: (3620, 768)
graph_emb 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
Text+Graph row_id 交集初始样本数: 3213
过滤后样本数: 3213
X_text  形状: (3213, 768)
X_graph 形状: (3213, 256)
dur_all_raw 形状: (3213, 1)
cat_all     形状: (3213, 3)
总样本数 N: 3213


In [4]:
# ===== Cell 4: 标准化三个数值块（Text / Graph / Duration） =====

# Text / Graph / Duration 只用 train 部分来拟合 scaler，但目前我们还没划分。
# 这里先对“全部”算一下 raw，后面在 Cross-Attn 和 RF 各自的 8:2 里面再 fit 分别的 scaler 也可以。
# 为了简单，Cross-Attn 和 RF 各自有自己的一套 scaler（你说不要求完全一致）。

X_text_raw  = X_text        # (N, d_t)
X_graph_raw = X_graph       # (N, d_g)
dur_raw     = dur_all_raw   # (N, 1)

print("X_text_raw  形状:", X_text_raw.shape)
print("X_graph_raw 形状:", X_graph_raw.shape)
print("dur_raw     形状:", dur_raw.shape)


X_text_raw  形状: (3213, 768)
X_graph_raw 形状: (3213, 256)
dur_raw     形状: (3213, 1)


In [5]:
# ===== Cell 5: Dataset & Cross-Attn 模型（Text+Graph） =====

class PairDataset(Dataset):
    def __init__(self, X1, X2, y):
        self.X1 = torch.from_numpy(X1).float()
        self.X2 = torch.from_numpy(X2).float()
        self.y  = torch.from_numpy(y).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]


class CrossAttnEncoder(nn.Module):
    """只负责把两模态融合成 fused embedding"""
    def __init__(self, dim_a, dim_b, hidden_dim=256, num_heads=4, dropout=0.1):
        super().__init__()
        self.proj_a = nn.Linear(dim_a, hidden_dim)
        self.proj_b = nn.Linear(dim_b, hidden_dim)

        self.attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=False,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, xa, xb):
        # xa, xb: (B, d_a/d_b)
        h_a = self.proj_a(xa)  # (B, hidden)
        h_b = self.proj_b(xb)  # (B, hidden)
        tokens = torch.stack([h_a, h_b], dim=0)  # (2, B, hidden)
        attn_out, _ = self.attn(tokens, tokens, tokens)  # (2, B, hidden)
        fused = attn_out.mean(dim=0)  # (B, hidden)
        return self.dropout(fused)    # (B, hidden)


class CrossAttnWithHead(nn.Module):
    """Encoder + 小回归头（只用于训练 encoder）"""
    def __init__(self, dim_a, dim_b,
                 hidden_dim=256,
                 num_heads=4,
                 mlp_hidden=512,
                 dropout=0.1):
        super().__init__()
        self.encoder = CrossAttnEncoder(dim_a, dim_b, hidden_dim, num_heads, dropout)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, mlp_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1),
        )

    def forward(self, xa, xb):
        fused = self.encoder(xa, xb)          # (B, hidden)
        out   = self.head(fused).squeeze(-1)  # (B,)
        return out, fused


In [6]:
# ===== Cell 6: Cross-Attn 按 SMILES 8:2 划分 train/val & 训练 =====

# 1) 按 SMILES 分组，8:2 给 Cross-Attn 用（注意：这是 Cross-Attn 自己的一套划分）
gss_ca = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2025)
N = len(y_all)
idx_all = np.arange(N)

ca_train_idx, ca_val_idx = next(
    gss_ca.split(np.zeros(N), y_all, groups_all)
)
ca_train_idx = np.array(ca_train_idx, dtype=np.int64)
ca_val_idx   = np.array(ca_val_idx, dtype=np.int64)

print("Cross-Attn train 样本数:", len(ca_train_idx))
print("Cross-Attn val   样本数:", len(ca_val_idx))

np.save(OUT_TG / "ca_train_idx_T_G.npy", ca_train_idx)
np.save(OUT_TG / "ca_val_idx_T_G.npy",   ca_val_idx)

# 2) 在 Cross-Attn 的 train80% 上拟合 scaler（Text / Graph）
scaler_text_ca  = StandardScaler().fit(X_text_raw[ca_train_idx])
scaler_graph_ca = StandardScaler().fit(X_graph_raw[ca_train_idx])

X_text_ca_all_std  = scaler_text_ca.transform(X_text_raw)
X_graph_ca_all_std = scaler_graph_ca.transform(X_graph_raw)

X_text_ca_train = X_text_ca_all_std[ca_train_idx]
X_graph_ca_train= X_graph_ca_all_std[ca_train_idx]
y_ca_train      = y_all[ca_train_idx]

X_text_ca_val   = X_text_ca_all_std[ca_val_idx]
X_graph_ca_val  = X_graph_ca_all_std[ca_val_idx]
y_ca_val        = y_all[ca_val_idx]

print("X_text_ca_train 形状:", X_text_ca_train.shape)
print("X_graph_ca_train形状:", X_graph_ca_train.shape)
print("X_text_ca_val   形状:", X_text_ca_val.shape)
print("X_graph_ca_val  形状:", X_graph_ca_val.shape)

# 3) 构建 DataLoader
batch_size = 64

ds_ca_tr  = PairDataset(X_text_ca_train, X_graph_ca_train, y_ca_train)
ds_ca_val = PairDataset(X_text_ca_val,  X_graph_ca_val,  y_ca_val)

dl_ca_tr  = DataLoader(ds_ca_tr,  batch_size=batch_size, shuffle=True,  drop_last=False)
dl_ca_val = DataLoader(ds_ca_val, batch_size=batch_size, shuffle=False, drop_last=False)

# 4) 定义 Cross-Attn 模型
dim_a = X_text_ca_train.shape[1]
dim_b = X_graph_ca_train.shape[1]

model_ca = CrossAttnWithHead(
    dim_a=dim_a,
    dim_b=dim_b,
    hidden_dim=256,
    num_heads=4,
    mlp_hidden=512,
    dropout=0.1,
).to(device)

loss_fn = nn.L1Loss()
optimizer = torch.optim.AdamW(
    model_ca.parameters(),
    lr=5e-4,
    weight_decay=1e-4,
)

max_epochs = 80
patience   = 10
best_val_mae = float("inf")
best_state_dict = None
best_epoch = -1
epochs_no_improve = 0

history_ca = {"train_mae": [], "val_mae": []}

# 5) 训练（用 val 做 early stopping）
for epoch in range(1, max_epochs + 1):
    model_ca.train()
    train_abs_err = []

    for X1_b, X2_b, y_b in dl_ca_tr:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        y_b  = y_b.to(device)

        optimizer.zero_grad()
        y_hat, fused = model_ca(X1_b, X2_b)
        loss = loss_fn(y_hat, y_b)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_ca.parameters(), max_norm=1.0)
        optimizer.step()

        train_abs_err.append(torch.abs(y_hat.detach() - y_b).cpu().numpy())

    train_mae = float(np.mean(np.concatenate(train_abs_err)))
    history_ca["train_mae"].append(train_mae)

    # 验证
    model_ca.eval()
    val_abs_err = []
    with torch.no_grad():
        for X1_b, X2_b, y_b in dl_ca_val:
            X1_b = X1_b.to(device)
            X2_b = X2_b.to(device)
            y_b  = y_b.to(device)

            y_hat, fused = model_ca(X1_b, X2_b)
            val_abs_err.append(torch.abs(y_hat - y_b).cpu().numpy())

    val_mae = float(np.mean(np.concatenate(val_abs_err)))
    history_ca["val_mae"].append(val_mae)

    print(f"[Cross-Attn T+G] Epoch {epoch:03d} | train MAE = {train_mae:.4f}, val MAE = {val_mae:.4f}")

    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        best_state_dict = {k: v.cpu().clone() for k, v in model_ca.state_dict().items()}
        best_epoch = epoch
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"[Cross-Attn T+G] Early stopping at epoch {epoch}, best_epoch = {best_epoch}")
            break

# 6) 加载最佳权重，抽出 encoder，对“所有样本”生成 fused_all_TG
if best_state_dict is not None:
    model_ca.load_state_dict(best_state_dict)
model_ca.to(device)
model_ca.eval()

encoder_tg = model_ca.encoder  # 只取 encoder
encoder_tg.eval().to(device)

ds_all_ca = PairDataset(X_text_ca_all_std, X_graph_ca_all_std, y_all)
dl_all_ca = DataLoader(ds_all_ca, batch_size=batch_size, shuffle=False)

fused_all_list = []
with torch.no_grad():
    for X1_b, X2_b, y_b in dl_all_ca:
        X1_b = X1_b.to(device)
        X2_b = X2_b.to(device)
        fused = encoder_tg(X1_b, X2_b)  # (B, hidden_dim)
        fused_all_list.append(fused.cpu().numpy())

fused_all_TG = np.concatenate(fused_all_list, axis=0)  # (N, hidden_dim)
print("fused_all_TG 形状:", fused_all_TG.shape)

# 保存 Cross-Attn 模型 & fused_all
torch.save(
    {
        "state_dict": best_state_dict,
        "config": {
            "dim_a": dim_a,
            "dim_b": dim_b,
            "hidden_dim": 256,
            "num_heads": 4,
            "mlp_hidden": 512,
            "dropout": 0.1,
        },
    },
    MODELS_CA / "crossattn_T_G_best.pt"
)

np.save(OUT_TG / "fused_all_T_G.npy", fused_all_TG)
np.save(OUT_TG / "row_id_all_T_G.npy", rowid_all)
np.save(OUT_TG / "y_all_T_G.npy",      y_all)
np.save(OUT_TG / "groups_all_T_G.npy", groups_all)

with open(OUT_TG / "crossattn_T_G_history.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_epoch": best_epoch,
            "best_val_mae": float(best_val_mae),
            "history": history_ca,
            "n_all": int(len(y_all)),
            "n_train": int(len(ca_train_idx)),
            "n_val": int(len(ca_val_idx)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Cross-Attn (Text+Graph) 训练完成，fused_all_TG 已保存。")


Cross-Attn train 样本数: 2581
Cross-Attn val   样本数: 632
X_text_ca_train 形状: (2581, 768)
X_graph_ca_train形状: (2581, 256)
X_text_ca_val   形状: (632, 768)
X_graph_ca_val  形状: (632, 256)
[Cross-Attn T+G] Epoch 001 | train MAE = 0.6559, val MAE = 0.5221
[Cross-Attn T+G] Epoch 002 | train MAE = 0.5275, val MAE = 0.5327
[Cross-Attn T+G] Epoch 003 | train MAE = 0.5052, val MAE = 0.5506
[Cross-Attn T+G] Epoch 004 | train MAE = 0.4947, val MAE = 0.5321
[Cross-Attn T+G] Epoch 005 | train MAE = 0.4934, val MAE = 0.5146
[Cross-Attn T+G] Epoch 006 | train MAE = 0.4810, val MAE = 0.5168
[Cross-Attn T+G] Epoch 007 | train MAE = 0.4804, val MAE = 0.5463
[Cross-Attn T+G] Epoch 008 | train MAE = 0.4691, val MAE = 0.5247
[Cross-Attn T+G] Epoch 009 | train MAE = 0.4659, val MAE = 0.5351
[Cross-Attn T+G] Epoch 010 | train MAE = 0.4538, val MAE = 0.5423
[Cross-Attn T+G] Epoch 011 | train MAE = 0.4576, val MAE = 0.5026
[Cross-Attn T+G] Epoch 012 | train MAE = 0.4547, val MAE = 0.5320
[Cross-Attn T+G] Epoch 013 | 

In [7]:
# ===== Cell 7: RF 端 8:2 划分（按 SMILES 分组，但独立于 Cross-Attn） =====

# 1) 先在 RF 端自己的 80% 上拟合 Duration 的 scaler
gss_rf = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2027)  # 不同 random_state
N = len(y_all)
rf_train_idx, rf_test_idx = next(
    gss_rf.split(np.zeros(N), y_all, groups_all)
)
rf_train_idx = np.array(rf_train_idx, dtype=np.int64)
rf_test_idx  = np.array(rf_test_idx, dtype=np.int64)

print("RF train 样本数:", len(rf_train_idx))
print("RF test  样本数:", len(rf_test_idx))

np.save(OUT_TG / "rf_train_idx_T_G.npy", rf_train_idx)
np.save(OUT_TG / "rf_test_idx_T_G.npy",  rf_test_idx)

# RF 的数值特征：Duration 标准化（只在 RF 的 train80% 上 fit）
scaler_dur_rf = StandardScaler().fit(dur_raw[rf_train_idx])
dur_all_std_rf = scaler_dur_rf.transform(dur_raw)

print("dur_all_std_rf 形状:", dur_all_std_rf.shape)

# 2) 构造 RF 用的最终特征： [ fused_all_TG , dur_all_std_rf , cat_all ]
X_all_RF = np.concatenate(
    [fused_all_TG, dur_all_std_rf, cat_all],
    axis=1
)
print("X_all_RF 形状:", X_all_RF.shape)

X_train_RF = X_all_RF[rf_train_idx]
y_train_RF = y_all[rf_train_idx]
groups_train_RF = groups_all[rf_train_idx]

X_test_RF  = X_all_RF[rf_test_idx]
y_test_RF  = y_all[rf_test_idx]

print("X_train_RF 形状:", X_train_RF.shape)
print("X_test_RF  形状:", X_test_RF.shape)


RF train 样本数: 2553
RF test  样本数: 660
dur_all_std_rf 形状: (3213, 1)
X_all_RF 形状: (3213, 260)
X_train_RF 形状: (2553, 260)
X_test_RF  形状: (660, 260)


In [8]:
# ===== Cell 8: RF 十折 CV + RandomizedSearchCV 在 RF 80% train 上进行 =====

# 1) RF 超参空间
param_distributions_rf = {
    "n_estimators":      [200, 300, 500, 800, 1000],
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf":  [1, 2, 4],
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

rf_base = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

cv_inner = GroupKFold(n_splits=10)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_inner,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（T+G fused + meta）...")
rf_search.fit(X_train_RF, y_train_RF, groups=groups_train_RF)

best_params_rf = rf_search.best_params_
best_score_rf  = rf_search.best_score_

print("\n[T+G+meta→RF] 最优超参（基于 RF 80% train 十折CV）：")
print(best_params_rf)
print("最优 CV 分数 (neg MAE):", best_score_rf)

# 2) 用最优超参在 RF train80% 上重训 RF
rf_final = RandomForestRegressor(
    **best_params_rf,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final.fit(X_train_RF, y_train_RF)

# 3) 评估：train80% & test20%
y_train_pred_RF = rf_final.predict(X_train_RF)
y_test_pred_RF  = rf_final.predict(X_test_RF)

metrics_train_RF = compute_regression_metrics(y_train_RF, y_train_pred_RF)
metrics_test_RF  = compute_regression_metrics(y_test_RF,  y_test_pred_RF)

print("\n===== [T+G fused + duration/meta → RF] RF train80% 指标 =====")
for k, v in metrics_train_RF.items():
    print(f"{k}: {v:.4f}")

print("\n===== [T+G fused + duration/meta → RF] RF test20% 指标 =====")
for k, v in metrics_test_RF.items():
    print(f"{k}: {v:.4f}")

# 4) 保存 RF 模型 & 结果
joblib.dump(
    {
        "model": rf_final,
        "scaler_text_ca": scaler_text_ca,
        "scaler_graph_ca": scaler_graph_ca,
        "encoder_state_dict": best_state_dict,
        "scaler_dur_rf": scaler_dur_rf,
        "cat_feature_names": cat_feature_names,
        "config": {
            "hidden_dim": 256,
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions_rf": param_distributions_rf,
        },
    },
    MODELS_RF / "rf_T_G_meta_from_CA.joblib"
)

np.save(OUT_TG / "X_train_RF_T_G.npy", X_train_RF)
np.save(OUT_TG / "X_test_RF_T_G.npy",  X_test_RF)
np.save(OUT_TG / "y_train_RF_T_G.npy", y_train_RF)
np.save(OUT_TG / "y_test_RF_T_G.npy",  y_test_RF)
np.save(OUT_TG / "y_train_pred_RF_T_G.npy", y_train_pred_RF)
np.save(OUT_TG / "y_test_pred_RF_T_G.npy",  y_test_pred_RF)

with open(OUT_TG / "metrics_RF_T_G_meta_from_CA.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_rf": best_params_rf,
            "best_score_cv_neg_mae": float(best_score_rf),
            "train80_metrics": metrics_train_RF,
            "test20_metrics": metrics_test_RF,
            "n_all": int(len(y_all)),
            "n_train80_rf": int(len(y_train_RF)),
            "n_test20_rf": int(len(y_test_RF)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ RF 训练 & 评估完成，模型与指标已保存。")


开始在 RF 80% train 上做十折 GroupKFold 随机超参搜索（T+G fused + meta）...
Fitting 10 folds for each of 30 candidates, totalling 300 fits

[T+G+meta→RF] 最优超参（基于 RF 80% train 十折CV）：
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.3, 'max_depth': 30}
最优 CV 分数 (neg MAE): -0.40945305363846163

===== [T+G fused + duration/meta → RF] RF train80% 指标 =====
MAE: 0.2066
RMSE: 0.3128
R2: 0.9294
Pearson_r: 0.9659

===== [T+G fused + duration/meta → RF] RF test20% 指标 =====
MAE: 0.3938
RMSE: 0.5549
R2: 0.7497
Pearson_r: 0.8671

✅ RF 训练 & 评估完成，模型与指标已保存。
