In [1]:
# ===== Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import numpy as np
import pandas as pd
import json

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

def compute_metrics(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return {
        "r2":   float(r2_score(y_true, y_pred)),
        "mae":  float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "r":    float(pearsonr(y_true, y_pred)[0]),
    }

def np_encoder(o):
    if isinstance(o, (np.integer,)):  return int(o)
    if isinstance(o, (np.floating,)): return float(o)
    if isinstance(o, np.ndarray):     return o.tolist()
    raise TypeError


In [2]:
# ===== Cell 2: 路径 & 原始 df，用于取 SMILES 分组 + 构建 log 标签 (T+P) =====
from pathlib import Path
import numpy as np
import pandas as pd

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# 单模态输出路径
TEXT_OUT_DIR = ROOT_MULTI / "SMILES" / "smiles_outputs"
PHY_OUT_DIR  = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"

# 原始数据（和单模态训练时用的要一致）
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

SMILES_COL = "SMILES_Canonical_RDKit"
LABEL_RAW  = "mgperL"       # 原始浓度列（数据集中已有）
LABEL_LOG  = "mgperL_log"   # 我们现在自己构建这一列

df = pd.read_excel(DATA_PATH, engine="openpyxl")

# 确保行号和当时 RF/单模态训练时的索引对齐（一般是 0..N-1）
df = df.reset_index(drop=True)

# 原始浓度转为数值
df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")

# 构建 log10(mgperL)，只对 >0 的取 log10，其他设为 NaN
mask_pos = df[LABEL_RAW] > 0
df[LABEL_LOG] = np.where(mask_pos, np.log10(df[LABEL_RAW]), np.nan)

print("原始 mgperL 列前几行：")
print(df[[LABEL_RAW]].head())

print("\n构建后的 mgperL_log 列前几行：")
print(df[[LABEL_LOG]].head())

# 后期融合统一用 log10(mg/L) 作为标签
groups_all = df[SMILES_COL].astype(str).values
y_all      = df[LABEL_LOG].values

print("\n有效标签数量（非 NaN）:", np.isfinite(y_all).sum())
print("df 形状:", df.shape)


原始 mgperL 列前几行：
   mgperL
0     1.3
1     2.5
2    40.8
3     1.9
4     0.6

构建后的 mgperL_log 列前几行：
   mgperL_log
0    0.113943
1    0.397940
2    1.610660
3    0.278754
4   -0.221849

有效标签数量（非 NaN）: 3620
df 形状: (3620, 35)


In [3]:
# ===== Cell 3: 加载 Text / PhysChem 单模态结果 =====

# Text 同上
y_train_T        = np.load(TEXT_OUT_DIR  / "rf_text_y_train.npy")
y_test_T         = np.load(TEXT_OUT_DIR  / "rf_text_y_test.npy")
oof_train_T      = np.load(TEXT_OUT_DIR  / "rf_text_oof_pred_train.npy")
y_pred_test_T    = np.load(TEXT_OUT_DIR  / "rf_text_y_pred_test.npy")
train_idx_T      = np.load(TEXT_OUT_DIR  / "rf_text_train_idx.npy")
test_idx_T       = np.load(TEXT_OUT_DIR  / "rf_text_test_idx.npy")

# PhysChem 端路径（你根据实际调整）
PHY_OUT_DIR      = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"

y_train_P        = np.load(PHY_OUT_DIR    / "rf_phys_y_train.npy")
y_test_P         = np.load(PHY_OUT_DIR    / "rf_phys_y_test.npy")
oof_train_P      = np.load(PHY_OUT_DIR    / "rf_phys_oof_pred_train.npy")
y_pred_test_P    = np.load(PHY_OUT_DIR    / "rf_phys_y_pred_test.npy")
train_idx_P      = np.load(PHY_OUT_DIR    / "rf_phys_train_idx.npy")
test_idx_P       = np.load(PHY_OUT_DIR    / "rf_phys_test_idx.npy")

# 交集
train_idx_inter = np.intersect1d(train_idx_T, train_idx_P)
test_idx_inter  = np.intersect1d(test_idx_T,  test_idx_P)

print("Train 交集:", len(train_idx_inter))
print("Test  交集:", len(test_idx_inter))

pos_T_train = {idx: i for i, idx in enumerate(train_idx_T)}
pos_P_train = {idx: i for i, idx in enumerate(train_idx_P)}
pos_T_test  = {idx: i for i, idx in enumerate(test_idx_T)}
pos_P_test  = {idx: i for i, idx in enumerate(test_idx_P)}

oof_T_train_used, oof_P_train_used = [], []
y_train_used, groups_train_used = [], []

for rid in train_idx_inter:
    iT = pos_T_train[rid]
    iP = pos_P_train[rid]

    oof_T_train_used.append(oof_train_T[iT])
    oof_P_train_used.append(oof_train_P[iP])

    y_train_used.append(y_all[rid])
    groups_train_used.append(groups_all[rid])

oof_T_train_used = np.array(oof_T_train_used, dtype=float)
oof_P_train_used = np.array(oof_P_train_used, dtype=float)
y_train_used     = np.array(y_train_used,     dtype=float)
groups_train_used= np.array(groups_train_used)

pred_T_test_used, pred_P_test_used = [], []
y_test_used, groups_test_used = [], []

for rid in test_idx_inter:
    iT = pos_T_test[rid]
    iP = pos_P_test[rid]

    pred_T_test_used.append(y_pred_test_T[iT])
    pred_P_test_used.append(y_pred_test_P[iP])

    y_test_used.append(y_all[rid])
    groups_test_used.append(groups_all[rid])

pred_T_test_used = np.array(pred_T_test_used, dtype=float)
pred_P_test_used = np.array(pred_P_test_used, dtype=float)
y_test_used      = np.array(y_test_used,      dtype=float)
groups_test_used = np.array(groups_test_used)

print("最终用于 T+P late fusion 的 train / test:", len(y_train_used), len(y_test_used))


Train 交集: 2136
Test  交集: 128
最终用于 T+P late fusion 的 train / test: 2136 128


In [4]:
# ===== Cell 4: 构造 meta 特征 & 训练后期融合 RF (Text+PhysChem) =====
X_meta_train = np.column_stack([oof_T_train_used, oof_P_train_used])
X_meta_test  = np.column_stack([pred_T_test_used, pred_P_test_used])

param_distributions_meta = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 5, 10, 20],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.5, 0.8],
}

rf_meta_base = RandomForestRegressor(
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)

cv_meta = GroupKFold(n_splits=10)

rf_meta_search = RandomizedSearchCV(
    estimator=rf_meta_base,
    param_distributions=param_distributions_meta,
    n_iter=30,
    scoring="r2",
    cv=cv_meta,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("\n==== 开始 Text+PhysChem 后期融合 (meta RF) 十折随机搜索 ====")
rf_meta_search.fit(X_meta_train, y_train_used, groups=groups_train_used)

best_params_meta = rf_meta_search.best_params_
best_cv_meta     = rf_meta_search.best_score_
print("\n===== Late Fusion (T+P) meta-RF 最优超参 =====")
print(best_params_meta)
print(f"CV 平均 R^2: {best_cv_meta:.4f}")

rf_meta_final = RandomForestRegressor(
    **best_params_meta,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)
rf_meta_final.fit(X_meta_train, y_train_used)

y_meta_train_pred = rf_meta_final.predict(X_meta_train)
y_meta_test_pred  = rf_meta_final.predict(X_meta_test)

metrics_meta_train = compute_metrics(y_train_used, y_meta_train_pred)
metrics_meta_test  = compute_metrics(y_test_used,  y_meta_test_pred)

print("\n===== Late Fusion (T+P) train 表现 =====")
for k, v in metrics_meta_train.items():
    print(f"{k}: {v:.4f}")

print("\n===== Late Fusion (T+P) test 表现 =====")
for k, v in metrics_meta_test.items():
    print(f"{k}: {v:.4f}")



==== 开始 Text+PhysChem 后期融合 (meta RF) 十折随机搜索 ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits

===== Late Fusion (T+P) meta-RF 最优超参 =====
{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 291}
CV 平均 R^2: 0.5369

===== Late Fusion (T+P) train 表现 =====
r2: 0.6023
mae: 0.5514
rmse: 0.7491
r: 0.7827

===== Late Fusion (T+P) test 表现 =====
r2: 0.4456
mae: 0.7215
rmse: 1.0004
r: 0.6735


In [5]:
# ===== Cell 5: 保存 T+P 后期融合结果（包含融合后的 embeddings）=====
LATE_OUT_DIR = ROOT_MULTI / "late_fusion_pairs" / "T_P"
LATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) 原来的 meta 特征（融合后的 2 维 embedding）
np.save(LATE_OUT_DIR / "X_meta_train_T_P.npy", X_meta_train.astype(np.float32))
np.save(LATE_OUT_DIR / "X_meta_test_T_P.npy",  X_meta_test.astype(np.float32))

# 2) 直观命名：late-level 融合 embedding
np.save(LATE_OUT_DIR / "late_T_P_emb_train.npy", X_meta_train.astype(np.float32))  # (N_train, 2)
np.save(LATE_OUT_DIR / "late_T_P_emb_test.npy",  X_meta_test.astype(np.float32))   # (N_test, 2)

# 3) 标签 & meta-RF 预测
np.save(LATE_OUT_DIR / "y_train_T_P.npy",      y_train_used.astype(np.float32))
np.save(LATE_OUT_DIR / "y_test_T_P.npy",       y_test_used.astype(np.float32))
np.save(LATE_OUT_DIR / "y_meta_train_pred_T_P.npy", y_meta_train_pred.astype(np.float32))
np.save(LATE_OUT_DIR / "y_meta_test_pred_T_P.npy",  y_meta_test_pred.astype(np.float32))

# 4) df 行号交集索引
np.save(LATE_OUT_DIR / "late_T_P_train_idx.npy", train_idx_inter.astype(np.int64))
np.save(LATE_OUT_DIR / "late_T_P_test_idx.npy",  test_idx_inter.astype(np.int64))

# 5) 指标
with open(LATE_OUT_DIR / "metrics_T_P_meta_rf.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "cv_best_r2": float(best_cv_meta),
            "train_metrics": metrics_meta_train,
            "test_metrics":  metrics_meta_test,
            "best_params_meta": best_params_meta,
            "n_train": int(len(y_train_used)),
            "n_test":  int(len(y_test_used)),
        },
        f, ensure_ascii=False, indent=2, default=np_encoder
    )

print("\n✅ Text+PhysChem 后期融合结果已保存到:", LATE_OUT_DIR)
print("   - late_T_P_emb_train.npy / late_T_P_emb_test.npy  为融合后的 2 维 embedding")
print("   - late_T_P_train_idx.npy / late_T_P_test_idx.npy   为对应 df 行号（交集样本）")



✅ Text+PhysChem 后期融合结果已保存到: /root/Invertebrates_EC50_multi_fusion/late_fusion_pairs/T_P
   - late_T_P_emb_train.npy / late_T_P_emb_test.npy  为融合后的 2 维 embedding
   - late_T_P_train_idx.npy / late_T_P_test_idx.npy   为对应 df 行号（交集样本）
