In [1]:
# ===== Cell 1: 导入 & 工具函数 =====
from pathlib import Path
import numpy as np
import pandas as pd
import json
import pickle

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

def compute_metrics(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return {
        "r2":   float(r2_score(y_true, y_pred)),
        "mae":  float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "r":    float(pearsonr(y_true, y_pred)[0]),
    }

def np_encoder(o):
    if isinstance(o, (np.integer,)):  return int(o)
    if isinstance(o, (np.floating,)): return float(o)
    if isinstance(o, np.ndarray):     return o.tolist()
    raise TypeError


In [2]:
# ===== Cell 2: 读原始数据 + 构建 log10(mg/L) 标签 =====

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

DATA_PATH  = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")
SMILES_COL = "SMILES_Canonical_RDKit"

LABEL_RAW  = "mgperL"       # 原始浓度列
LABEL_LOG  = "mgperL_log"   # 我们现在自己构建

df = pd.read_excel(DATA_PATH, engine="openpyxl")
df = df.reset_index(drop=True)

# 原始浓度转成数值
df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")

# 只对 >0 的样本做 log10，其他设为 NaN
mask_pos = df[LABEL_RAW] > 0
df[LABEL_LOG] = np.where(mask_pos, np.log10(df[LABEL_RAW]), np.nan)

print(df[[LABEL_RAW, LABEL_LOG]].head())

y_all_full      = df[LABEL_LOG].values          # log10(mg/L)
groups_all_full = df[SMILES_COL].astype(str).values

print("总样本数:", len(df))
print("有效标签数(非 NaN):", np.isfinite(y_all_full).sum())


   mgperL  mgperL_log
0     1.3    0.113943
1     2.5    0.397940
2    40.8    1.610660
3     1.9    0.278754
4     0.6   -0.221849
总样本数: 3620
有效标签数(非 NaN): 3620


In [3]:
# ===== Cell 3: 加载 Text / Graph / PhysChem 嵌入，并按 df 行号对齐 =====

# ---- Text 端 CLS embedding ----
SMILES_OUT_DIR = ROOT_MULTI / "SMILES" / "smiles_outputs"
TEXT_EMB_PATH  = SMILES_OUT_DIR / "reg_smiles_cls_embeddings_all.npy"  # 按你实际文件名改

X_text_all = np.load(TEXT_EMB_PATH)   # (N_df, d_text=768)
rowid_text = np.arange(len(df), dtype=int)

print("X_text_all 形状:", X_text_all.shape)

# ---- Graph 端 GNN embedding ----
GRAPH_OUT_DIR    = ROOT_MULTI / "graph" / "graph_outputs"
GRAPH_EMB_PATH   = GRAPH_OUT_DIR / "reg_graph_embeddings.npy"          # (N_graph, d_graph)
GRAPH_ROWID_PATH = GRAPH_OUT_DIR / "row_id_graph_for_emb.npy"          # df 行号

X_graph_raw = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)

print("X_graph_raw 形状:", X_graph_raw.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

# ---- PhysChem 端 MLP embedding ----
PHY_OUT_DIR    = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"
PHY_EMB_PATH   = PHY_OUT_DIR / "emb_physchem_mlp_all.npy"  # descMLP 保存的 embedding
PHY_ROWID_PATH = PHY_OUT_DIR / "row_id_clean.npy"          # 对应 df 行号

X_phys_raw = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)

print("X_phys_raw 形状:", X_phys_raw.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

# ---- 三模态 row_id 交集 ----
ids_text  = set(rowid_text.tolist())
ids_graph = set(rowid_graph.tolist())
ids_phys  = set(rowid_phys.tolist())

ids_inter = sorted(list(ids_text & ids_graph & ids_phys))
print("三模态交集样本数:", len(ids_inter))

idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph)}
idx_map_phys  = {rid: i for i, rid in enumerate(rowid_phys)}

X_text_list, X_graph_list, X_phys_list = [], [], []
y_list, groups_list, rid_list = [], [], []

for rid in ids_inter:
    y_val = y_all_full[rid]
    if not np.isfinite(y_val):
        continue  # 把 mgperL<=0 或缺失的样本丢掉

    X_text_list.append(X_text_all[rid])
    X_graph_list.append(X_graph_raw[idx_map_graph[rid]])
    X_phys_list.append(X_phys_raw[idx_map_phys[rid]])

    y_list.append(y_all_full[rid])
    groups_list.append(groups_all_full[rid])
    rid_list.append(rid)

X_text = np.stack(X_text_list, axis=0)   # (N, d_text)
X_graph= np.stack(X_graph_list, axis=0)  # (N, d_graph)
X_phys = np.stack(X_phys_list, axis=0)   # (N, d_phys)
y_all  = np.array(y_list, dtype=float)   # (N,)
groups = np.array(groups_list)           # (N,)
rowid  = np.array(rid_list, dtype=int)   # (N,)

print("对齐后 X_text:", X_text.shape)
print("对齐后 X_graph:", X_graph.shape)
print("对齐后 X_phys:", X_phys.shape)
print("y_all 形状:", y_all.shape)


X_text_all 形状: (3620, 768)
X_graph_raw 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
X_phys_raw 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
三模态交集样本数: 3103
对齐后 X_text: (3103, 768)
对齐后 X_graph: (3103, 256)
对齐后 X_phys: (3103, 64)
y_all 形状: (3103,)


In [4]:
# ===== Cell 4: 定义三模态 Cross-Attn 模型 =====
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用设备:", device)

class TriModalDataset(Dataset):
    def __init__(self, X_text, X_graph, X_phys, y):
        self.X_text  = torch.from_numpy(X_text).float()
        self.X_graph = torch.from_numpy(X_graph).float()
        self.X_phys  = torch.from_numpy(X_phys).float()
        self.y       = torch.from_numpy(y).float()

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return (
            self.X_text[idx],
            self.X_graph[idx],
            self.X_phys[idx],
            self.y[idx],
        )

class TriCrossAttnRegressor(nn.Module):
    """
    三模态 Cross-Attn：
    - 输入: text_emb (B, d_t), graph_emb (B, d_g), phys_emb (B, d_p)
    - 先线性投影到同一维度 hidden_dim
    - 堆成 3 个 token 做 multi-head self-attention
    - 对 3 个 token 做平均池化 → fused_emb
    - fused_emb → 小 MLP 输出 y
    - 可返回 fused_emb 用作中期融合 embedding
    """
    def __init__(self, dim_text, dim_graph, dim_phys,
                 hidden_dim=256, num_heads=4, mlp_hidden=512, dropout=0.1):
        super().__init__()
        self.text_proj  = nn.Linear(dim_text,  hidden_dim)
        self.graph_proj = nn.Linear(dim_graph, hidden_dim)
        self.phys_proj  = nn.Linear(dim_phys,  hidden_dim)

        self.attn = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=False,  # (S, B, E)
        )

        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, mlp_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, 1),
        )

    def forward(self, x_text, x_graph, x_phys, return_emb=False, return_attn=False):
        """
        x_text : (B, d_t)
        x_graph: (B, d_g)
        x_phys : (B, d_p)
        """

        t = self.text_proj(x_text)   # (B, H)
        g = self.graph_proj(x_graph) # (B, H)
        p = self.phys_proj(x_phys)   # (B, H)

        # tokens: (S=3, B, H)
        tokens = torch.stack([t, g, p], dim=0)

        # attn_w: (B, 3, 3)  (batch, tgt_len, src_len)
        attn_out, attn_w = self.attn(
            tokens, tokens, tokens,
            need_weights=True,
            average_attn_weights=True
        )

        fused = attn_out.mean(dim=0)           # (B, H)
        y_pred = self.mlp(fused).squeeze(-1)   # (B,)

        if return_attn:
            # 输入 token（=模态）的总贡献：对 query(tgt) 维取平均 -> (B, 3)，且每行和=1
            contrib = attn_w.mean(dim=1)  # (B,3): [text, graph, phys]
            if return_emb:
                return y_pred, fused, contrib, attn_w
            return y_pred, contrib, attn_w

        if return_emb:
            return y_pred, fused
        return y_pred


# 设定 Cross-Attn 超参（可以以后再调）
CA_HIDDEN_DIM   = 256
CA_NUM_HEADS    = 4
CA_MLP_HIDDEN   = 512
CA_DROPOUT      = 0.1
CA_LR           = 5e-4
CA_WEIGHT_DECAY = 1e-4
CA_BATCH_SIZE   = 64
CA_EPOCHS       = 50

torch.manual_seed(GLOBAL_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(GLOBAL_SEED)


使用设备: cuda


In [5]:
# ===== Cell 5: Cross-Attn 8:2 划分 + 训练 =====

from sklearn.model_selection import GroupShuffleSplit

# 1) 8:2 按 SMILES 分组划分（用于训练神经网络）
gss_ca = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2026)
idx_all = np.arange(len(y_all))
train_idx_ca, val_idx_ca = next(gss_ca.split(idx_all, y_all, groups))

train_idx_ca = train_idx_ca.astype(int)
val_idx_ca   = val_idx_ca.astype(int)

print("Cross-Attn train 样本数:", len(train_idx_ca))
print("Cross-Attn val   样本数:", len(val_idx_ca))

X_text_train_ca  = X_text[train_idx_ca]
X_graph_train_ca = X_graph[train_idx_ca]
X_phys_train_ca  = X_phys[train_idx_ca]
y_train_ca       = y_all[train_idx_ca]

X_text_val_ca  = X_text[val_idx_ca]
X_graph_val_ca = X_graph[val_idx_ca]
X_phys_val_ca  = X_phys[val_idx_ca]
y_val_ca       = y_all[val_idx_ca]

train_ds = TriModalDataset(X_text_train_ca, X_graph_train_ca, X_phys_train_ca, y_train_ca)
val_ds   = TriModalDataset(X_text_val_ca,   X_graph_val_ca,   X_phys_val_ca,   y_val_ca)

train_loader = DataLoader(train_ds, batch_size=CA_BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=CA_BATCH_SIZE*2, shuffle=False)

dim_text  = X_text.shape[1]
dim_graph = X_graph.shape[1]
dim_phys  = X_phys.shape[1]

model = TriCrossAttnRegressor(
    dim_text=dim_text,
    dim_graph=dim_graph,
    dim_phys=dim_phys,
    hidden_dim=CA_HIDDEN_DIM,
    num_heads=CA_NUM_HEADS,
    mlp_hidden=CA_MLP_HIDDEN,
    dropout=CA_DROPOUT,
).to(device)

criterion = nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=CA_LR, weight_decay=CA_WEIGHT_DECAY)

best_val_loss = float("inf")
best_state = None

for epoch in range(1, CA_EPOCHS + 1):
    # ---- train ----
    model.train()
    train_loss_sum = 0.0
    for x_t, x_g, x_p, y in train_loader:
        x_t = x_t.to(device)
        x_g = x_g.to(device)
        x_p = x_p.to(device)
        y   = y.to(device)

        optimizer.zero_grad()
        y_pred = model(x_t, x_g, x_p, return_emb=False)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item() * y.size(0)

    train_loss = train_loss_sum / len(train_ds)

    # ---- val ----
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for x_t, x_g, x_p, y in val_loader:
            x_t = x_t.to(device)
            x_g = x_g.to(device)
            x_p = x_p.to(device)
            y   = y.to(device)

            y_pred = model(x_t, x_g, x_p, return_emb=False)
            loss = criterion(y_pred, y)
            val_loss_sum += loss.item() * y.size(0)

    val_loss = val_loss_sum / len(val_ds)

    print(f"[Epoch {epoch:03d}] train_L1 = {train_loss:.4f}, val_L1 = {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        print(f"  -> 更新 best_state, 当前 best_val_L1 = {best_val_loss:.4f}")

# 加载最佳参数
if best_state is not None:
    model.load_state_dict(best_state)
    print("\n✅ 加载最优验证损失对应的模型参数")


Cross-Attn train 样本数: 2487
Cross-Attn val   样本数: 616
[Epoch 001] train_L1 = 0.6766, val_L1 = 0.5733
  -> 更新 best_state, 当前 best_val_L1 = 0.5733
[Epoch 002] train_L1 = 0.5173, val_L1 = 0.5107
  -> 更新 best_state, 当前 best_val_L1 = 0.5107
[Epoch 003] train_L1 = 0.5040, val_L1 = 0.5394
[Epoch 004] train_L1 = 0.5040, val_L1 = 0.4954
  -> 更新 best_state, 当前 best_val_L1 = 0.4954
[Epoch 005] train_L1 = 0.4868, val_L1 = 0.5026
[Epoch 006] train_L1 = 0.4865, val_L1 = 0.4785
  -> 更新 best_state, 当前 best_val_L1 = 0.4785
[Epoch 007] train_L1 = 0.4651, val_L1 = 0.4930
[Epoch 008] train_L1 = 0.4750, val_L1 = 0.4938
[Epoch 009] train_L1 = 0.4698, val_L1 = 0.4990
[Epoch 010] train_L1 = 0.4636, val_L1 = 0.4853
[Epoch 011] train_L1 = 0.4552, val_L1 = 0.5110
[Epoch 012] train_L1 = 0.4692, val_L1 = 0.5025
[Epoch 013] train_L1 = 0.4649, val_L1 = 0.4858
[Epoch 014] train_L1 = 0.4448, val_L1 = 0.4929
[Epoch 015] train_L1 = 0.4364, val_L1 = 0.4760
  -> 更新 best_state, 当前 best_val_L1 = 0.4760
[Epoch 016] train_L1 =

In [6]:
# ===== Cell 6: 提取全体样本的三模态融合 embedding =====
model.eval()

fused_list = []
y_pred_all_list = []

with torch.no_grad():
    BATCH = 256
    N_tot = len(y_all)
    for start in range(0, N_tot, BATCH):
        end = min(N_tot, start + BATCH)
        x_t = torch.from_numpy(X_text[start:end]).float().to(device)
        x_g = torch.from_numpy(X_graph[start:end]).float().to(device)
        x_p = torch.from_numpy(X_phys[start:end]).float().to(device)

        y_pred_batch, fused_batch = model(x_t, x_g, x_p, return_emb=True)
        fused_list.append(fused_batch.cpu().numpy())
        y_pred_all_list.append(y_pred_batch.cpu().numpy())

fused_all   = np.concatenate(fused_list, axis=0)       # (N, hidden_dim)
y_pred_all_ca = np.concatenate(y_pred_all_list, axis=0)  # 网络直接预测的 y (没用也可以保存一下)

print("fused_all 形状:", fused_all.shape)
print("Cross-Attn 直接预测 y 形状:", y_pred_all_ca.shape)

MID_OUT_DIR = ROOT_MULTI / "mid_fusion_TGP"
MID_OUT_DIR.mkdir(parents=True, exist_ok=True)

np.save(MID_OUT_DIR / "fused_emb_TGP_all.npy", fused_all.astype(np.float32))
np.save(MID_OUT_DIR / "rowid_TGP.npy", rowid.astype(np.int64))
np.save(MID_OUT_DIR / "y_all_TGP.npy", y_all.astype(np.float32))
np.save(MID_OUT_DIR / "groups_TGP.npy", groups.astype("U"))

# 保存 Cross-Attn 端预测（可选）
np.save(MID_OUT_DIR / "ca_y_pred_all_TGP.npy", y_pred_all_ca.astype(np.float32))
np.save(MID_OUT_DIR / "train_idx_ca.npy", train_idx_ca.astype(np.int64))
np.save(MID_OUT_DIR / "val_idx_ca.npy",   val_idx_ca.astype(np.int64))

# 保存模型参数
torch.save(model.state_dict(), MID_OUT_DIR / "tricrossattn_TGP_state_dict.pt")

print("\n✅ 三模态 Cross-Attn 融合 embedding 已保存到:", MID_OUT_DIR)


fused_all 形状: (3103, 256)
Cross-Attn 直接预测 y 形状: (3103,)

✅ 三模态 Cross-Attn 融合 embedding 已保存到: /root/Invertebrates_EC50_multi_fusion/mid_fusion_TGP


In [7]:
def collect_attn_contrib(model, X_text, X_graph, X_phys, device, batch_size=256):
    model.eval()
    contrib_all = []

    n = len(X_text)
    with torch.no_grad():
        for s in range(0, n, batch_size):
            e = min(n, s + batch_size)

            xt = torch.from_numpy(X_text[s:e]).float().to(device)
            xg = torch.from_numpy(X_graph[s:e]).float().to(device)
            xp = torch.from_numpy(X_phys[s:e]).float().to(device)

            # y_pred, fused, contrib, attn_w
            _, _, contrib, _ = model(xt, xg, xp, return_emb=True, return_attn=True)
            contrib_all.append(contrib.detach().cpu().numpy())

    contrib_all = np.concatenate(contrib_all, axis=0)  # (N,3)
    return contrib_all

# ===== 1) 全体样本贡献 =====
contrib_all = collect_attn_contrib(model, X_text, X_graph, X_phys, device, batch_size=256)

mean_all = contrib_all.mean(axis=0)
std_all  = contrib_all.std(axis=0)

attn_report_all = {
    "scope": "ALL",
    "mean": {"text": float(mean_all[0]), "graph": float(mean_all[1]), "phys": float(mean_all[2])},
    "std":  {"text": float(std_all[0]),  "graph": float(std_all[1]),  "phys": float(std_all[2])},
}

np.save(MID_OUT_DIR / "mid_attn_contrib_TGP_all.npy", contrib_all.astype(np.float32))
with open(MID_OUT_DIR / "mid_attn_contrib_TGP_all_report.json", "w", encoding="utf-8") as f:
    json.dump(attn_report_all, f, ensure_ascii=False, indent=2)

print("✅ saved:", MID_OUT_DIR / "mid_attn_contrib_TGP_all.npy")
print("✅ saved:", MID_OUT_DIR / "mid_attn_contrib_TGP_all_report.json")
print(attn_report_all)

# ===== 2) 验证集贡献（推荐看这个，更贴近泛化贡献）=====
idx = val_idx_ca  # 你 notebook 里的 val index
contrib_val = collect_attn_contrib(model, X_text[idx], X_graph[idx], X_phys[idx], device, batch_size=256)

mean_val = contrib_val.mean(axis=0)
std_val  = contrib_val.std(axis=0)

attn_report_val = {
    "scope": "VAL",
    "mean": {"text": float(mean_val[0]), "graph": float(mean_val[1]), "phys": float(mean_val[2])},
    "std":  {"text": float(std_val[0]),  "graph": float(std_val[1]),  "phys": float(std_val[2])},
}

np.save(MID_OUT_DIR / "mid_attn_contrib_TGP_val.npy", contrib_val.astype(np.float32))
with open(MID_OUT_DIR / "mid_attn_contrib_TGP_val_report.json", "w", encoding="utf-8") as f:
    json.dump(attn_report_val, f, ensure_ascii=False, indent=2)

print("✅ saved:", MID_OUT_DIR / "mid_attn_contrib_TGP_val.npy")
print("✅ saved:", MID_OUT_DIR / "mid_attn_contrib_TGP_val_report.json")
print(attn_report_val)


✅ saved: /root/Invertebrates_EC50_multi_fusion/mid_fusion_TGP/mid_attn_contrib_TGP_all.npy
✅ saved: /root/Invertebrates_EC50_multi_fusion/mid_fusion_TGP/mid_attn_contrib_TGP_all_report.json
{'scope': 'ALL', 'mean': {'text': 0.18062035739421844, 'graph': 0.24020308256149292, 'phys': 0.5791768431663513}, 'std': {'text': 0.08111577481031418, 'graph': 0.10781783610582352, 'phys': 0.1305657923221588}}
✅ saved: /root/Invertebrates_EC50_multi_fusion/mid_fusion_TGP/mid_attn_contrib_TGP_val.npy
✅ saved: /root/Invertebrates_EC50_multi_fusion/mid_fusion_TGP/mid_attn_contrib_TGP_val_report.json
{'scope': 'VAL', 'mean': {'text': 0.18288934230804443, 'graph': 0.24668678641319275, 'phys': 0.57042396068573}, 'std': {'text': 0.07997278869152069, 'graph': 0.10985127091407776, 'phys': 0.13516274094581604}}


In [7]:
# ===== Cell 7: 构建 meta 特征，用于 RF 中期融合 =====

NUM_META_COLS_CANDIDATE = ["Duration_Value(hour)"]   # 数值型
CAT_META_COLS_CANDIDATE = ["Effect", "Endpoint"]     # 类别型

NUM_META_COLS = [c for c in NUM_META_COLS_CANDIDATE if c in df.columns]
CAT_META_COLS = [c for c in CAT_META_COLS_CANDIDATE if c in df.columns]

print("数值型 meta 列:", NUM_META_COLS)
print("类别型 meta 列:", CAT_META_COLS)

df_meta = df.loc[rowid].copy().reset_index(drop=True)

# 数值型
if NUM_META_COLS:
    scaler_meta = StandardScaler()
    X_num = scaler_meta.fit_transform(df_meta[NUM_META_COLS].values)
else:
    X_num = np.zeros((len(df_meta), 0), dtype=np.float32)

# 类别型
if CAT_META_COLS:
    ohe_meta = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat = ohe_meta.fit_transform(df_meta[CAT_META_COLS].astype(str))
else:
    X_cat = np.zeros((len(df_meta), 0), dtype=np.float32)

X_meta = np.concatenate([X_num, X_cat], axis=1)
print("X_meta 形状:", X_meta.shape)

# 也把 meta 相关变换保存，方便后续复用
with open(MID_OUT_DIR / "meta_scaler.pkl", "wb") as f:
    pickle.dump(scaler_meta if NUM_META_COLS else None, f)

with open(MID_OUT_DIR / "meta_ohe.pkl", "wb") as f:
    pickle.dump(ohe_meta if CAT_META_COLS else None, f)

np.save(MID_OUT_DIR / "X_meta_TGP_all.npy", X_meta.astype(np.float32))


数值型 meta 列: ['Duration_Value(hour)']
类别型 meta 列: ['Effect', 'Endpoint']
X_meta 形状: (3103, 4)


In [8]:
# ===== Cell 8: RF 中期融合 (fused_emb + meta) =====

# 1) RF 自己一套 8:2 划分（按 SMILES 分组，与 Cross-Attn 可以不同）
gss_rf = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2030)
idx_all = np.arange(len(y_all))
train_idx_rf, test_idx_rf = next(gss_rf.split(idx_all, y_all, groups))

train_idx_rf = train_idx_rf.astype(int)
test_idx_rf  = test_idx_rf.astype(int)

print("RF 中期融合 train 样本数:", len(train_idx_rf))
print("RF 中期融合 test  样本数:", len(test_idx_rf))

X_fused_train = fused_all[train_idx_rf]
X_fused_test  = fused_all[test_idx_rf]
X_meta_train  = X_meta[train_idx_rf]
X_meta_test   = X_meta[test_idx_rf]

y_train_rf = y_all[train_idx_rf]
y_test_rf  = y_all[test_idx_rf]
groups_train_rf = groups[train_idx_rf]
rowid_train_rf  = rowid[train_idx_rf]
rowid_test_rf   = rowid[test_idx_rf]

# 中期融合最终特征 = fused_emb + meta
X_train_mid = np.concatenate([X_fused_train, X_meta_train], axis=1)
X_test_mid  = np.concatenate([X_fused_test,  X_meta_test],  axis=1)

print("X_train_mid 形状:", X_train_mid.shape)
print("X_test_mid  形状:", X_test_mid.shape)

# 2) RF 十折 GroupKFold + RandomizedSearchCV（和前面保持一致）
param_distributions_mid = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 10, 20, 30],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

base_rf_mid = RandomForestRegressor(
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)

cv_rf = GroupKFold(n_splits=10)
cv_indices_rf = list(cv_rf.split(X_train_mid, y_train_rf, groups_train_rf))

rf_mid_search = RandomizedSearchCV(
    estimator=base_rf_mid,
    param_distributions=param_distributions_mid,
    n_iter=30,
    scoring="r2",
    cv=cv_indices_rf,
    n_jobs=-1,
    verbose=2,
    random_state=GLOBAL_SEED,
)

print("\n==== [Mid T+G+P] RF 十折随机搜索 ====")
rf_mid_search.fit(X_train_mid, y_train_rf, groups=groups_train_rf)

best_params_mid = rf_mid_search.best_params_
best_cv_mid     = rf_mid_search.best_score_

print("\n===== Mid 三模态 RF 最优超参 =====")
print(best_params_mid)
print(f"CV 平均 R^2: {best_cv_mid:.4f}")

# 3) 用最优超参 + 同一套 folds，计算 train OOF 预测
oof_pred_train_mid = np.zeros_like(y_train_rf, dtype=float)

for fold_idx, (tr_idx, val_idx) in enumerate(cv_indices_rf, 1):
    print(f"  -> OOF fold {fold_idx} / {len(cv_indices_rf)}")
    rf_fold = RandomForestRegressor(
        **best_params_mid,
        n_jobs=-1,
        random_state=GLOBAL_SEED + 200 + fold_idx,
    )
    rf_fold.fit(X_train_mid[tr_idx], y_train_rf[tr_idx])
    oof_pred_train_mid[val_idx] = rf_fold.predict(X_train_mid[val_idx])

metrics_oof_mid = compute_metrics(y_train_rf, oof_pred_train_mid)
print("\n===== Mid 三模态 RF：train OOF 表现 =====")
for k, v in metrics_oof_mid.items():
    print(f"{k}: {v:.4f}")

# 4) 在整个 train80% 上拟合最终 RF，并在 test20% 上评估
best_rf_mid = RandomForestRegressor(
    **best_params_mid,
    n_jobs=-1,
    random_state=GLOBAL_SEED + 300,
)
best_rf_mid.fit(X_train_mid, y_train_rf)

y_train_mid_pred = best_rf_mid.predict(X_train_mid)
y_test_mid_pred  = best_rf_mid.predict(X_test_mid)

metrics_train_mid = compute_metrics(y_train_rf, y_train_mid_pred)
metrics_test_mid  = compute_metrics(y_test_rf,  y_test_mid_pred)

print("\n===== Mid 三模态 RF 训练集表现 =====")
for k, v in metrics_train_mid.items():
    print(f"{k}: {v:.4f}")

print("\n===== Mid 三模态 RF 测试集表现（独立 20%）=====")
for k, v in metrics_test_mid.items():
    print(f"{k}: {v:.4f}")


RF 中期融合 train 样本数: 2455
RF 中期融合 test  样本数: 648
X_train_mid 形状: (2455, 260)
X_test_mid  形状: (648, 260)

==== [Mid T+G+P] RF 十折随机搜索 ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=9, n_estimators=900; total time= 4.1min
[CV] END max_depth=20, max_features=0.8, min_samples_leaf=2, min_samples_split=9, n_estimators=691; total time=13.8min
[CV] END max_depth=20, max_features=0.3, min_samples_leaf=3, min_samples_split=4, n_estimators=766; total time= 9.6min
[CV] END max_depth=10, max_features=0.8, min_samples_leaf=2, min_samples_split=5, n_estimators=291; total time= 7.1min
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=3, min_samples_split=9, n_estimators=234; total time= 4.1min
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=3, min_samples_split=9, n_estimators=234; total time= 3.9min
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_est