In [1]:
# ===== Cell 1: 导入 & 工具函数 =====
from pathlib import Path
import numpy as np
import pandas as pd
import json
import pickle

from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is the
# base value of every `random_state` handed to the scikit-learn objects further down.
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

def compute_metrics(y_true, y_pred):
    """Score predictions against targets with four regression metrics.

    Both inputs are converted to float NumPy arrays before scoring.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, in the same order as `y_true`.

    Returns:
        dict with the keys "r2", "mae", "rmse" and "r" (Pearson correlation
        coefficient), each stored as a plain Python float.
    """
    targets = np.asarray(y_true, dtype=float)
    preds = np.asarray(y_pred, dtype=float)

    # Metrics are evaluated in the same order as the keys of the result.
    r2_value = r2_score(targets, preds)
    mae_value = mean_absolute_error(targets, preds)
    rmse_value = np.sqrt(mean_squared_error(targets, preds))
    pearson_value = pearsonr(targets, preds)[0]

    names = ("r2", "mae", "rmse", "r")
    values = (r2_value, mae_value, rmse_value, pearson_value)
    return {name: float(value) for name, value in zip(names, values)}

def np_encoder(o):
    """`default=` hook for `json.dump` that turns NumPy values into plain Python ones.

    Args:
        o: The object the json encoder could not serialize on its own.

    Returns:
        `bool` for NumPy booleans, `int` for NumPy integers, `float` for NumPy
        floats, and a (nested) list for arrays.

    Raises:
        TypeError: for every other type, which is how a `default` hook tells
            the encoder that the object is not serializable.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")


In [2]:
# ===== Cell 2: Base paths & df (its SMILES column supplies the grouping labels) =====
ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Folders from which the single-modality RF arrays are loaded in the next cell.
SMILES_OUT_DIR = ROOT_MULTI.joinpath("SMILES", "smiles_outputs")
GRAPH_OUT_DIR  = ROOT_MULTI.joinpath("graph", "graph_outputs")
PHY_OUT_DIR    = ROOT_MULTI.joinpath("phychem", "physchem_mlp_rf_v2")

DATA_PATH  = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")
SMILES_COL = "SMILES_Canonical_RDKit"

# Read the sheet, then renumber the rows 0..n-1 so positions and index labels coincide.
df = pd.read_excel(DATA_PATH, engine="openpyxl")
df = df.reset_index(drop=True)

# One SMILES string per df row; indexed later with df row numbers to build CV groups.
groups_all = df[SMILES_COL].astype(str).values

print("df 形状:", df.shape)
print("示例 SMILES:", df[SMILES_COL].head())


df 形状: (3620, 34)
示例 SMILES: 0          [Cl-].[Cl-].[Zn+2]
1    O=S(=O)([O-])[O-].[Zn+2]
2          [Cl-].[Cl-].[Pb+2]
3    O=S(=O)([O-])[O-].[Cu+2]
4    O=S(=O)([O-])[O-].[Cu+2]
Name: SMILES_Canonical_RDKit, dtype: object


In [3]:
# ===== Cell 3: Load the Text / Graph / PhysChem single-modality RF outputs and intersect them =====

def _load_modality_outputs(out_dir, prefix):
    """Load the six arrays of one single-modality RF run and check their alignment.

    Reads `<prefix>_<name>.npy` from `out_dir` for the names y_train, y_test,
    oof_pred_train, y_pred_test, train_idx and test_idx.

    Args:
        out_dir: Directory (a `Path`) that holds the `.npy` files.
        prefix: File-name prefix of the modality, e.g. "rf_text".

    Returns:
        Tuple `(y_train, y_test, oof_pred_train, y_pred_test, train_idx, test_idx)`.

    Raises:
        ValueError: if the label, prediction and row-index arrays of the train
            split (or of the test split) do not all have the same length. The
            later cells look values up by position, so unequal lengths would
            otherwise misalign features and labels.
    """
    names = ("y_train", "y_test", "oof_pred_train", "y_pred_test", "train_idx", "test_idx")
    loaded = {name: np.load(out_dir / f"{prefix}_{name}.npy") for name in names}

    split_members = (
        ("y_train", "oof_pred_train", "train_idx"),
        ("y_test", "y_pred_test", "test_idx"),
    )
    for members in split_members:
        lengths = {name: len(loaded[name]) for name in members}
        if len(set(lengths.values())) > 1:
            raise ValueError(f"{prefix}: misaligned arrays in {out_dir}: {lengths}")

    return tuple(loaded[name] for name in names)


# ---------- Text ----------  (the *_idx_* arrays hold df row numbers)
(y_train_T, y_test_T, oof_T_train, y_pred_T_test,
 train_idx_T, test_idx_T) = _load_modality_outputs(SMILES_OUT_DIR, "rf_text")

print("Text: train / test:", len(y_train_T), len(y_test_T))

# ---------- Graph ----------
(y_train_G, y_test_G, oof_G_train, y_pred_G_test,
 train_idx_G, test_idx_G) = _load_modality_outputs(GRAPH_OUT_DIR, "rf_graph")

print("Graph: train / test:", len(y_train_G), len(y_test_G))

# ---------- PhysChem ----------  (PHY_OUT_DIR comes from the path cell above)
(y_train_P, y_test_P, oof_P_train, y_pred_P_test,
 train_idx_P, test_idx_P) = _load_modality_outputs(PHY_OUT_DIR, "rf_phys")

print("PhysChem: train / test:", len(y_train_P), len(y_test_P))

# ---------- Keep only rows present in all three modalities (train and test intersected separately) ----------
# NOTE(review): in the recorded run 1612 train rows survive but only 25 test rows,
# far fewer than any single modality's test split (632-731). The per-modality
# train/test splits apparently do not coincide - confirm this is intended.
train_idx_inter = np.intersect1d(np.intersect1d(train_idx_T, train_idx_G), train_idx_P)
test_idx_inter  = np.intersect1d(np.intersect1d(test_idx_T,  test_idx_G),  test_idx_P)

print("\n三模态 train 交集样本数:", len(train_idx_inter))
print("三模态 test  交集样本数:", len(test_idx_inter))

# Lookup tables "df row number -> position inside that modality's train/test arrays"
pos_T_train = {idx: i for i, idx in enumerate(train_idx_T)}
pos_G_train = {idx: i for i, idx in enumerate(train_idx_G)}
pos_P_train = {idx: i for i, idx in enumerate(train_idx_P)}

pos_T_test  = {idx: i for i, idx in enumerate(test_idx_T)}
pos_G_test  = {idx: i for i, idx in enumerate(test_idx_G)}
pos_P_test  = {idx: i for i, idx in enumerate(test_idx_P)}


Text: train / test: 2889 731
Graph: train / test: 2581 632
PhysChem: train / test: 2690 716

三模态 train 交集样本数: 1612
三模态 test  交集样本数: 25


In [4]:
# ===== Cell 4: Build the late-fusion (T+G+P) meta features and labels =====

def _gather_by_row_id(row_ids, sources):
    """Pick, for every df row number, the matching entry of each source array.

    Args:
        row_ids: df row numbers to collect, in output order.
        sources: Sequence of `(position_map, values)` pairs, where
            `position_map[row_id]` is the position of that row inside `values`.

    Returns:
        List with one float array per source, each ordered like `row_ids`.

    Raises:
        KeyError: if a row number is missing from one of the position maps.
    """
    gathered = []
    for position_map, values in sources:
        # Explicit integer dtype keeps the index array valid even when row_ids is empty.
        positions = np.fromiter(
            (position_map[rid] for rid in row_ids), dtype=np.intp, count=len(row_ids)
        )
        gathered.append(np.asarray(values, dtype=float)[positions])
    return gathered


def _check_labels_agree(split_name, labels_by_modality):
    """Raise ValueError unless all modalities report the same labels for the same rows."""
    reference = labels_by_modality[0]
    for tag, labels in zip("GP", labels_by_modality[1:]):
        if not np.allclose(reference, labels, equal_nan=True):
            n_bad = int(np.sum(~np.isclose(reference, labels, equal_nan=True)))
            raise ValueError(
                f"{split_name} labels of modality T and {tag} differ on {n_bad} rows"
            )


# ---- train part: the single-modality OOF predictions are the features ----
oof_T_used, oof_G_used, oof_P_used = _gather_by_row_id(
    train_idx_inter,
    [(pos_T_train, oof_T_train), (pos_G_train, oof_G_train), (pos_P_train, oof_P_train)],
)

# The labels are taken from the Text modality; the other two must carry the same
# values for the same df rows, which is verified here instead of being assumed.
y_train_by_modality = _gather_by_row_id(
    train_idx_inter,
    [(pos_T_train, y_train_T), (pos_G_train, y_train_G), (pos_P_train, y_train_P)],
)
_check_labels_agree("train", y_train_by_modality)
y_train_used = y_train_by_modality[0]

X_meta_train = np.column_stack([oof_T_used, oof_G_used, oof_P_used])
print("X_meta_train 形状:", X_meta_train.shape)

# ---- test part: each modality's predictions on its own test split are the features ----
pred_T_used, pred_G_used, pred_P_used = _gather_by_row_id(
    test_idx_inter,
    [(pos_T_test, y_pred_T_test), (pos_G_test, y_pred_G_test), (pos_P_test, y_pred_P_test)],
)

y_test_by_modality = _gather_by_row_id(
    test_idx_inter,
    [(pos_T_test, y_test_T), (pos_G_test, y_test_G), (pos_P_test, y_test_P)],
)
_check_labels_agree("test", y_test_by_modality)
y_test_used = y_test_by_modality[0]

X_meta_test = np.column_stack([pred_T_used, pred_G_used, pred_P_used])
print("X_meta_test 形状:", X_meta_test.shape)

print("\nLate T+G+P meta-level train/test 样本数:", len(y_train_used), len(y_test_used))


X_meta_train 形状: (1612, 3)
X_meta_test 形状: (25, 3)

Late T+G+P meta-level train/test 样本数: 1612 25


In [5]:
# ===== Cell 5: SMILES group labels for the meta-RF =====

# Index the per-row SMILES strings with the intersected df row numbers, once for
# the meta-train rows and once for the meta-test rows.
groups_train_meta, groups_test_meta = (
    groups_all[row_ids] for row_ids in (train_idx_inter, test_idx_inter)
)

print("示例 groups_train_meta 前 5 个:", groups_train_meta[:5])


示例 groups_train_meta 前 5 个: ['O=S(=O)([O-])[O-].[Zn+2]' '[Cl-].[Cl-].[Pb+2]'
 '[Cr].[Cr].[K+].[K+].[O-].[O-].[O].[O].[O].[O].[O]'
 'O=S(=O)([O-])[O-].[Fe+2]' '[Ca+2].[O-]Cl.[O-]Cl']


In [6]:
# ===== Cell 6: Train the late-fusion (T+G+P) meta-RF =====

# 1) Hyper-parameter search space
param_distributions_meta = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 5, 10, 20],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.5, 0.8],
}

# The search below already runs its candidate x fold fits in parallel (n_jobs=-1),
# so the estimator it clones stays single-threaded: nesting a second n_jobs=-1
# level inside every worker only oversubscribes the CPU. The forest's randomness
# is governed by random_state, which is unchanged.
rf_meta_base = RandomForestRegressor(
    n_jobs=1,
    random_state=GLOBAL_SEED,
)

# 2) 10-fold GroupKFold; rows sharing a SMILES string stay in the same fold.
#    The folds are materialised once so the search and the OOF loop use the same splits.
cv_meta = GroupKFold(n_splits=10)
cv_indices_meta = list(cv_meta.split(X_meta_train, y_train_used, groups_train_meta))

rf_meta_search = RandomizedSearchCV(
    estimator=rf_meta_base,
    param_distributions=param_distributions_meta,
    n_iter=30,
    scoring="r2",
    cv=cv_indices_meta,
    n_jobs=-1,
    verbose=2,
    random_state=GLOBAL_SEED,
)

print("\n==== [Late T+G+P] meta-RF 十折随机搜索 ====")
rf_meta_search.fit(X_meta_train, y_train_used, groups=groups_train_meta)

best_params_meta = rf_meta_search.best_params_
best_cv_meta     = rf_meta_search.best_score_

print("\n===== Late 三模态 meta-RF 最优超参 =====")
print(best_params_meta)
print(f"CV 平均 R^2: {best_cv_meta:.4f}")

# 3) Out-of-fold predictions with the best hyper-parameters on the same folds.
#    Each fold gets its own seed; these fits are not nested, so n_jobs=-1 is fine here.
oof_pred_train_meta = np.zeros_like(y_train_used, dtype=float)

for fold_idx, (tr_idx, val_idx) in enumerate(cv_indices_meta, 1):
    print(f"  -> meta OOF fold {fold_idx} / {len(cv_indices_meta)}")
    rf_fold = RandomForestRegressor(
        **best_params_meta,
        n_jobs=-1,
        random_state=GLOBAL_SEED + 500 + fold_idx,
    )
    rf_fold.fit(X_meta_train[tr_idx], y_train_used[tr_idx])
    oof_pred_train_meta[val_idx] = rf_fold.predict(X_meta_train[val_idx])

metrics_oof_meta = compute_metrics(y_train_used, oof_pred_train_meta)
print("\n===== Late T+G+P meta-RF：train OOF 表现 =====")
for k, v in metrics_oof_meta.items():
    print(f"{k}: {v:.4f}")

# 4) Fit the final meta-RF on all meta-train rows and evaluate it on the meta-test rows.
rf_meta_final = RandomForestRegressor(
    **best_params_meta,
    n_jobs=-1,
    random_state=GLOBAL_SEED + 600,
)
rf_meta_final.fit(X_meta_train, y_train_used)

y_meta_train_pred = rf_meta_final.predict(X_meta_train)
y_meta_test_pred  = rf_meta_final.predict(X_meta_test)

# The train metrics are in-sample (the model has seen these rows); the OOF
# metrics printed above are the held-out estimate for the training rows.
metrics_meta_train = compute_metrics(y_train_used, y_meta_train_pred)
metrics_meta_test  = compute_metrics(y_test_used,  y_meta_test_pred)

print("\n===== Late T+G+P meta-RF 训练集表现 =====")
for k, v in metrics_meta_train.items():
    print(f"{k}: {v:.4f}")

# NOTE(review): the label below says "20%", but the meta-test set is whatever rows
# survive the three-way intersection (25 rows in the recorded run) - confirm the wording.
print("\n===== Late T+G+P meta-RF 测试集表现（独立 20% 交集样本）=====")
for k, v in metrics_meta_test.items():
    print(f"{k}: {v:.4f}")



==== [Late T+G+P] meta-RF 十折随机搜索 ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits

===== Late 三模态 meta-RF 最优超参 =====
{'max_depth': 5, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 976}
CV 平均 R^2: 0.5427
  -> meta OOF fold 1 / 10
  -> meta OOF fold 2 / 10
  -> meta OOF fold 3 / 10
  -> meta OOF fold 4 / 10
  -> meta OOF fold 5 / 10
  -> meta OOF fold 6 / 10
  -> meta OOF fold 7 / 10
  -> meta OOF fold 8 / 10
  -> meta OOF fold 9 / 10
  -> meta OOF fold 10 / 10

===== Late T+G+P meta-RF：train OOF 表现 =====
r2: 0.5596
mae: 0.5876
rmse: 0.8149
r: 0.7483

===== Late T+G+P meta-RF 训练集表现 =====
r2: 0.6409
mae: 0.5341
rmse: 0.7359
r: 0.8029

===== Late T+G+P meta-RF 测试集表现（独立 20% 交集样本）=====
r2: 0.7407
mae: 0.5134
rmse: 0.6336
r: 0.8732


In [7]:
# ===== Cell 7: Persist the late-fusion (T+G+P) results, including the 3-dimensional fused embedding =====

LATE_OUT_DIR = ROOT_MULTI / "late_fusion_TGP"
LATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

# (file name, array, dtype on disk) - written one after another in this order.
_late_arrays = (
    # 1) meta-level features = the 3-dimensional late-fusion embedding
    ("X_meta_train_TGP.npy",        X_meta_train,        np.float32),
    ("X_meta_test_TGP.npy",         X_meta_test,         np.float32),
    #    the same arrays again under a more descriptive name
    ("late_TGP_emb_train.npy",      X_meta_train,        np.float32),
    ("late_TGP_emb_test.npy",       X_meta_test,         np.float32),
    # 2) labels, meta-RF predictions and OOF predictions
    ("y_train_TGP_late.npy",        y_train_used,        np.float32),
    ("y_test_TGP_late.npy",         y_test_used,         np.float32),
    ("y_meta_train_pred_TGP.npy",   y_meta_train_pred,   np.float32),
    ("y_meta_test_pred_TGP.npy",    y_meta_test_pred,    np.float32),
    ("oof_pred_train_TGP_late.npy", oof_pred_train_meta, np.float32),
    # 3) intersected df row numbers (for aligning with other fusion / single-modality runs)
    ("late_TGP_train_idx.npy",      train_idx_inter,     np.int64),
    ("late_TGP_test_idx.npy",       test_idx_inter,      np.int64),
    #    the SMILES group labels as well
    ("groups_train_TGP_late.npy",   groups_train_meta,   "U"),
    ("groups_test_TGP_late.npy",    groups_test_meta,    "U"),
)
for _file_name, _array, _dtype in _late_arrays:
    np.save(LATE_OUT_DIR / _file_name, _array.astype(_dtype))

# 4) the fitted meta-RF model and the metrics
_model_path = LATE_OUT_DIR / "meta_rf_TGP_model.pkl"
with _model_path.open("wb") as f:
    pickle.dump(rf_meta_final, f)

# NumPy integers among the best hyper-parameters become plain ints; anything
# else NumPy-typed is left to the json `default` hook.
_best_params_plain = {}
for k, v in best_params_meta.items():
    _best_params_plain[k] = int(v) if isinstance(v, np.integer) else v

metrics_all_late = dict(
    cv_best_r2=float(best_cv_meta),
    oof_metrics=metrics_oof_meta,
    train_metrics=metrics_meta_train,
    test_metrics=metrics_meta_test,
    best_params_meta=_best_params_plain,
    n_train=int(len(y_train_used)),
    n_test=int(len(y_test_used)),
)

_metrics_path = LATE_OUT_DIR / "metrics_TGP_late_meta_rf.json"
with _metrics_path.open("w", encoding="utf-8") as f:
    json.dump(metrics_all_late, f, indent=2, ensure_ascii=False, default=np_encoder)

print("\n✅ 三模态后期融合 (Text+Graph+PhysChem) 全流程完成")
print("   - late_TGP_emb_train.npy / late_TGP_emb_test.npy 为 3 维 late-level embeddings")
print("   - late_TGP_train_idx.npy / late_TGP_test_idx.npy 为对应 df 行号（交集样本）")



✅ 三模态后期融合 (Text+Graph+PhysChem) 全流程完成
   - late_TGP_emb_train.npy / late_TGP_emb_test.npy 为 3 维 late-level embeddings
   - late_TGP_train_idx.npy / late_TGP_test_idx.npy 为对应 df 行号（交集样本）
