In [1]:
# ===== Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import numpy as np
import pandas as pd
import json

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

def compute_metrics(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return {
        "r2":   float(r2_score(y_true, y_pred)),
        "mae":  float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "r":    float(pearsonr(y_true, y_pred)[0]),
    }

def np_encoder(o):
    if isinstance(o, (np.integer,)):  return int(o)
    if isinstance(o, (np.floating,)): return float(o)
    if isinstance(o, np.ndarray):     return o.tolist()
    raise TypeError


In [2]:
# ===== Cell 2: 路径 & 原始 df，用于取 SMILES 分组 + 构建 log 标签 =====
from pathlib import Path
import numpy as np
import pandas as pd

ROOT_MULTI   = Path("/root/Invertebrates_EC50_multi_fusion")

# 单模态输出路径（根据你之前的脚本）
TEXT_OUT_DIR  = ROOT_MULTI / "SMILES" / "smiles_outputs"
GRAPH_OUT_DIR = ROOT_MULTI / "graph"  / "graph_outputs"

# 原始数据（只用来取 SMILES 和原始 mgperL）
DATA_PATH   = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

SMILES_COL  = "SMILES_Canonical_RDKit"
LABEL_RAW   = "mgperL"       # ✅ 这是你数据里已有的原始列
LABEL_LOG   = "mgperL_log"   # ✅ 我们现在自己构建这一列

df = pd.read_excel(DATA_PATH, engine="openpyxl")

# 确保行号和你当时 GroupShuffleSplit 的 index 对齐
# （之前 RF/单模态训练时也是用这种行号的话，这里要保持一致）
df = df.reset_index(drop=True)

# 原始浓度转成数值
df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")

# 构建 log10(mgperL)，只对 >0 的做 log10，其他给 NaN
mask_pos = df[LABEL_RAW] > 0
df[LABEL_LOG] = np.where(mask_pos, np.log10(df[LABEL_RAW]), np.nan)

print("mgperL 原始列前几行：")
print(df[[LABEL_RAW]].head())

print("\n构建后的 mgperL_log 前几行：")
print(df[[LABEL_LOG]].head())

# 后面统一用 LABEL_LOG 作为 y_all
groups_all = df[SMILES_COL].astype(str).values
y_all      = df[LABEL_LOG].values

print("\n有效标签数量（非 NaN）:", np.isfinite(y_all).sum())
print("df 形状:", df.shape)


mgperL 原始列前几行：
   mgperL
0     1.3
1     2.5
2    40.8
3     1.9
4     0.6

构建后的 mgperL_log 前几行：
   mgperL_log
0    0.113943
1    0.397940
2    1.610660
3    0.278754
4   -0.221849

有效标签数量（非 NaN）: 3620
df 形状: (3620, 35)


In [3]:
# ===== Cell 3: 加载 Text / Graph 单模态结果 =====

# --- Text 端 ---
y_train_T        = np.load(TEXT_OUT_DIR  / "rf_text_y_train.npy")
y_test_T         = np.load(TEXT_OUT_DIR  / "rf_text_y_test.npy")
oof_train_T      = np.load(TEXT_OUT_DIR  / "rf_text_oof_pred_train.npy")
y_pred_test_T    = np.load(TEXT_OUT_DIR  / "rf_text_y_pred_test.npy")
train_idx_T      = np.load(TEXT_OUT_DIR  / "rf_text_train_idx.npy")
test_idx_T       = np.load(TEXT_OUT_DIR  / "rf_text_test_idx.npy")

print("Text train / test:", len(y_train_T), len(y_test_T))

# --- Graph 端 ---
y_train_G        = np.load(GRAPH_OUT_DIR / "rf_graph_y_train.npy")
y_test_G         = np.load(GRAPH_OUT_DIR / "rf_graph_y_test.npy")
oof_train_G      = np.load(GRAPH_OUT_DIR / "rf_graph_oof_pred_train.npy")
y_pred_test_G    = np.load(GRAPH_OUT_DIR / "rf_graph_y_pred_test.npy")
train_idx_G      = np.load(GRAPH_OUT_DIR / "rf_graph_train_idx.npy")
test_idx_G       = np.load(GRAPH_OUT_DIR  / "rf_graph_test_idx.npy")

print("Graph train / test:", len(y_train_G), len(y_test_G))

# ===== 交集：只保留两边都存在的样本 =====
train_idx_inter = np.intersect1d(train_idx_T, train_idx_G)
test_idx_inter  = np.intersect1d(test_idx_T,  test_idx_G)

print("Train 交集样本数:", len(train_idx_inter))
print("Test  交集样本数:", len(test_idx_inter))

# 建立 row_idx -> 在各自 train/test 数组中的位置
pos_T_train = {idx: i for i, idx in enumerate(train_idx_T)}
pos_G_train = {idx: i for i, idx in enumerate(train_idx_G)}
pos_T_test  = {idx: i for i, idx in enumerate(test_idx_T)}
pos_G_test  = {idx: i for i, idx in enumerate(test_idx_G)}

# 按交集顺序收集 OOF / 预测 / 标签
oof_T_train_used, oof_G_train_used = [], []
y_train_used, groups_train_used = [], []

for rid in train_idx_inter:
    iT = pos_T_train[rid]
    iG = pos_G_train[rid]

    oof_T_train_used.append(oof_train_T[iT])
    oof_G_train_used.append(oof_train_G[iG])

    # 标签直接从 df 拿（两模态本质上应该一致）
    y_train_used.append(y_all[rid])
    groups_train_used.append(groups_all[rid])

oof_T_train_used = np.array(oof_T_train_used, dtype=float)
oof_G_train_used = np.array(oof_G_train_used, dtype=float)
y_train_used     = np.array(y_train_used,     dtype=float)
groups_train_used= np.array(groups_train_used)

# test 集
pred_T_test_used, pred_G_test_used = [], []
y_test_used, groups_test_used = [], []

for rid in test_idx_inter:
    iT = pos_T_test[rid]
    iG = pos_G_test[rid]

    pred_T_test_used.append(y_pred_test_T[iT])
    pred_G_test_used.append(y_pred_test_G[iG])

    y_test_used.append(y_all[rid])
    groups_test_used.append(groups_all[rid])

pred_T_test_used = np.array(pred_T_test_used, dtype=float)
pred_G_test_used = np.array(pred_G_test_used, dtype=float)
y_test_used      = np.array(y_test_used,      dtype=float)
groups_test_used = np.array(groups_test_used)

print("最终用于 late fusion 的 train / test 样本数:", len(y_train_used), len(y_test_used))


Text train / test: 2889 731
Graph train / test: 2581 632
Train 交集样本数: 2071
Test  交集样本数: 133
最终用于 late fusion 的 train / test 样本数: 2071 133


In [4]:
# ===== Cell 4: 构造 meta 特征 & 训练后期融合 RF (Text+Graph) =====

# meta level 特征：两模态的 OOF / 预测拼在一起
X_meta_train = np.column_stack([oof_T_train_used, oof_G_train_used])  # (N_train, 2)
X_meta_test  = np.column_stack([pred_T_test_used, pred_G_test_used])  # (N_test, 2)

print("X_meta_train 形状:", X_meta_train.shape)
print("X_meta_test  形状:", X_meta_test.shape)

# meta RF
param_distributions_meta = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 5, 10, 20],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.5, 0.8],
}

rf_meta_base = RandomForestRegressor(
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)

cv_meta = GroupKFold(n_splits=10)

rf_meta_search = RandomizedSearchCV(
    estimator=rf_meta_base,
    param_distributions=param_distributions_meta,
    n_iter=30,
    scoring="r2",
    cv=cv_meta,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("\n==== 开始 Text+Graph 后期融合 (meta RF) 十折随机搜索 ====")
rf_meta_search.fit(X_meta_train, y_train_used, groups=groups_train_used)

best_params_meta = rf_meta_search.best_params_
best_cv_meta     = rf_meta_search.best_score_

print("\n===== Late Fusion (T+G) meta-RF 最优超参 =====")
print(best_params_meta)
print(f"CV 平均 R^2: {best_cv_meta:.4f}")

# 用最优超参重新训练 meta RF
rf_meta_final = RandomForestRegressor(
    **best_params_meta,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)
rf_meta_final.fit(X_meta_train, y_train_used)

y_meta_train_pred = rf_meta_final.predict(X_meta_train)
y_meta_test_pred  = rf_meta_final.predict(X_meta_test)

metrics_meta_train = compute_metrics(y_train_used, y_meta_train_pred)
metrics_meta_test  = compute_metrics(y_test_used,  y_meta_test_pred)

print("\n===== Late Fusion (T+G) train 表现 =====")
for k, v in metrics_meta_train.items():
    print(f"{k}: {v:.4f}")

print("\n===== Late Fusion (T+G) test 表现 =====")
for k, v in metrics_meta_test.items():
    print(f"{k}: {v:.4f}")


X_meta_train 形状: (2071, 2)
X_meta_test  形状: (133, 2)

==== 开始 Text+Graph 后期融合 (meta RF) 十折随机搜索 ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits

===== Late Fusion (T+G) meta-RF 最优超参 =====
{'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 299}
CV 平均 R^2: 0.5447

===== Late Fusion (T+G) train 表现 =====
r2: 0.7000
mae: 0.4811
rmse: 0.6690
r: 0.8404

===== Late Fusion (T+G) test 表现 =====
r2: 0.6193
mae: 0.5912
rmse: 0.8787
r: 0.7972


In [5]:
# ===== Cell 5: 保存 T+G 后期融合结果（包含融合后的 embeddings）=====
LATE_OUT_DIR = ROOT_MULTI / "late_fusion_pairs" / "T_G"
LATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) 原来的 meta 特征（其实就是融合后的 2 维 embedding）
np.save(LATE_OUT_DIR / "X_meta_train_T_G.npy", X_meta_train.astype(np.float32))
np.save(LATE_OUT_DIR / "X_meta_test_T_G.npy",  X_meta_test.astype(np.float32))

# 2) 用更直观的命名再保存一份，明确这是“融合后的 embeddings”
np.save(LATE_OUT_DIR / "late_T_G_emb_train.npy", X_meta_train.astype(np.float32))  # (N_train, 2)
np.save(LATE_OUT_DIR / "late_T_G_emb_test.npy",  X_meta_test.astype(np.float32))   # (N_test, 2)

# 3) 标签 & meta-RF 预测
np.save(LATE_OUT_DIR / "y_train_T_G.npy",      y_train_used.astype(np.float32))
np.save(LATE_OUT_DIR / "y_test_T_G.npy",       y_test_used.astype(np.float32))
np.save(LATE_OUT_DIR / "y_meta_train_pred_T_G.npy", y_meta_train_pred.astype(np.float32))
np.save(LATE_OUT_DIR / "y_meta_test_pred_T_G.npy",  y_meta_test_pred.astype(np.float32))

# 4) 对应的样本索引（df 的行号交集）
np.save(LATE_OUT_DIR / "late_T_G_train_idx.npy", train_idx_inter.astype(np.int64))
np.save(LATE_OUT_DIR / "late_T_G_test_idx.npy",  test_idx_inter.astype(np.int64))

# 5) 指标 & 超参
with open(LATE_OUT_DIR / "metrics_T_G_meta_rf.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "cv_best_r2": float(best_cv_meta),
            "train_metrics": metrics_meta_train,
            "test_metrics":  metrics_meta_test,
            "best_params_meta": best_params_meta,
            "n_train": int(len(y_train_used)),
            "n_test":  int(len(y_test_used)),
        },
        f, ensure_ascii=False, indent=2, default=np_encoder
    )

print("\n✅ Text+Graph 后期融合结果已保存到:", LATE_OUT_DIR)
print("   - late_T_G_emb_train.npy / late_T_G_emb_test.npy  为融合后的 2 维 embedding")
print("   - late_T_G_train_idx.npy / late_T_G_test_idx.npy   为对应 df 行号（交集样本）")



✅ Text+Graph 后期融合结果已保存到: /root/Invertebrates_EC50_multi_fusion/late_fusion_pairs/T_G
   - late_T_G_emb_train.npy / late_T_G_emb_test.npy  为融合后的 2 维 embedding
   - late_T_G_train_idx.npy / late_T_G_test_idx.npy   为对应 df 行号（交集样本）
