In [1]:
# ===== Cell 1: import & 配置 =====
from pathlib import Path
import numpy as np
import pandas as pd
import json

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)

def compute_metrics(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return {
        "r2":   float(r2_score(y_true, y_pred)),
        "mae":  float(mean_absolute_error(y_true, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "r":    float(pearsonr(y_true, y_pred)[0]),
    }

def np_encoder(o):
    if isinstance(o, (np.integer,)):  return int(o)
    if isinstance(o, (np.floating,)): return float(o)
    if isinstance(o, np.ndarray):     return o.tolist()
    raise TypeError


In [2]:
# ===== Cell 2: 读原始数据 + 构建 log10(mg/L) 标签 =====

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

DATA_PATH  = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")
SMILES_COL = "SMILES_Canonical_RDKit"

LABEL_RAW  = "mgperL"       # 原始浓度列（数据里有）
LABEL_LOG  = "mgperL_log"   # 我们现在自己构建

df = pd.read_excel(DATA_PATH, engine="openpyxl")
df = df.reset_index(drop=True)

# 原始浓度转成数值
df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")

# 只对 >0 的样本做 log10，其他设为 NaN
mask_pos = df[LABEL_RAW] > 0
df[LABEL_LOG] = np.where(mask_pos, np.log10(df[LABEL_RAW]), np.nan)

print(df[[LABEL_RAW, LABEL_LOG]].head())

y_all_full      = df[LABEL_LOG].values          # log10(mg/L)
groups_all_full = df[SMILES_COL].astype(str).values

print("总样本数:", len(df))
print("有效标签数(非 NaN):", np.isfinite(y_all_full).sum())


   mgperL  mgperL_log
0     1.3    0.113943
1     2.5    0.397940
2    40.8    1.610660
3     1.9    0.278754
4     0.6   -0.221849
总样本数: 3620
有效标签数(非 NaN): 3620


In [3]:
# ===== Cell 3: 加载 Text / Graph / PhysChem 嵌入，并按 df 行号对齐 =====

# ---- Text 端 CLS embedding ----
SMILES_OUT_DIR = ROOT_MULTI / "SMILES" / "smiles_outputs"
TEXT_EMB_PATH  = SMILES_OUT_DIR / "reg_smiles_cls_embeddings_all.npy"  # 根据你实际文件名改

X_text_all = np.load(TEXT_EMB_PATH)   # (N_df, d_text=768)
rowid_text = np.arange(len(df), dtype=int)

print("X_text_all 形状:", X_text_all.shape)

# ---- Graph 端 GNN embedding ----
GRAPH_OUT_DIR    = ROOT_MULTI / "graph" / "graph_outputs"
GRAPH_EMB_PATH   = GRAPH_OUT_DIR / "reg_graph_embeddings.npy"          # (N_graph, d_graph)
GRAPH_ROWID_PATH = GRAPH_OUT_DIR / "row_id_graph_for_emb.npy"          # df 行号

X_graph_raw = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)

print("X_graph_raw 形状:", X_graph_raw.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

# ---- PhysChem 端 MLP embedding ----
PHY_OUT_DIR    = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"
PHY_EMB_PATH   = PHY_OUT_DIR / "emb_physchem_mlp_all.npy"  # 你 descMLP 里保存的全体 embedding
PHY_ROWID_PATH = PHY_OUT_DIR / "row_id_clean.npy"          # 对应 df 行号

X_phys_raw = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)

print("X_phys_raw 形状:", X_phys_raw.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

# ---- 三模态 row_id 交集 ----
ids_text  = set(rowid_text.tolist())
ids_graph = set(rowid_graph.tolist())
ids_phys  = set(rowid_phys.tolist())

ids_inter = sorted(list(ids_text & ids_graph & ids_phys))
print("三模态交集样本数:", len(ids_inter))

idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph)}
idx_map_phys  = {rid: i for i, rid in enumerate(rowid_phys)}

X_text_list, X_graph_list, X_phys_list = [], [], []
y_list, groups_list, rid_list = [], [], []

for rid in ids_inter:
    y_val = y_all_full[rid]
    if not np.isfinite(y_val):
        continue  # 把 mgperL<=0 或缺失的样本丢掉

    X_text_list.append(X_text_all[rid])
    X_graph_list.append(X_graph_raw[idx_map_graph[rid]])
    X_phys_list.append(X_phys_raw[idx_map_phys[rid]])

    y_list.append(y_all_full[rid])
    groups_list.append(groups_all_full[rid])
    rid_list.append(rid)

X_text = np.stack(X_text_list, axis=0)   # (N, d_text)
X_graph= np.stack(X_graph_list, axis=0)  # (N, d_graph)
X_phys = np.stack(X_phys_list, axis=0)   # (N, d_phys)
y_all  = np.array(y_list, dtype=float)   # (N,)
groups = np.array(groups_list)           # (N,)
rowid  = np.array(rid_list, dtype=int)   # (N,)

print("对齐后 X_text:", X_text.shape)
print("对齐后 X_graph:", X_graph.shape)
print("对齐后 X_phys:", X_phys.shape)
print("y_all 形状:", y_all.shape)


X_text_all 形状: (3620, 768)
X_graph_raw 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
X_phys_raw 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
三模态交集样本数: 3103
对齐后 X_text: (3103, 768)
对齐后 X_graph: (3103, 256)
对齐后 X_phys: (3103, 64)
y_all 形状: (3103,)


In [4]:
# ===== Cell 4: 构建 meta 特征 (Duration / Effect / Endpoint) =====

NUM_META_COLS_CANDIDATE = ["Duration_Value(hour)"]   # 数值型
CAT_META_COLS_CANDIDATE = ["Effect", "Endpoint"]     # 类别型

NUM_META_COLS = [c for c in NUM_META_COLS_CANDIDATE if c in df.columns]
CAT_META_COLS = [c for c in CAT_META_COLS_CANDIDATE if c in df.columns]

print("数值型 meta 列:", NUM_META_COLS)
print("类别型 meta 列:", CAT_META_COLS)

# 针对对齐后的 rowid 取 meta
df_meta = df.loc[rowid].copy().reset_index(drop=True)

# 数值型
if NUM_META_COLS:
    scaler_meta = StandardScaler()
    X_num = scaler_meta.fit_transform(df_meta[NUM_META_COLS].values)
else:
    X_num = np.zeros((len(df_meta), 0), dtype=np.float32)

# 类别型
if CAT_META_COLS:
    ohe_meta = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat = ohe_meta.fit_transform(df_meta[CAT_META_COLS].astype(str))
else:
    X_cat = np.zeros((len(df_meta), 0), dtype=np.float32)

X_meta = np.concatenate([X_num, X_cat], axis=1)
print("X_meta 形状:", X_meta.shape)


数值型 meta 列: ['Duration_Value(hour)']
类别型 meta 列: ['Effect', 'Endpoint']
X_meta 形状: (3103, 4)


In [5]:
# ===== Cell 5: 8:2 按 SMILES 分组划分 + TEXT PCA 256 + 三模态早期拼接 =====
from sklearn.decomposition import PCA

# 1) 8:2 GroupShuffleSplit（针对三模态对齐后的这 N 个样本）
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2025)
idx_all = np.arange(len(y_all))
train_idx, test_idx = next(gss.split(idx_all, y_all, groups))

train_idx = train_idx.astype(int)
test_idx  = test_idx.astype(int)

print("Early 三模态 train 样本数:", len(train_idx))
print("Early 三模态 test  样本数:", len(test_idx))

# 切分
X_text_train, X_text_test   = X_text[train_idx],  X_text[test_idx]
X_graph_train, X_graph_test = X_graph[train_idx], X_graph[test_idx]
X_phys_train, X_phys_test   = X_phys[train_idx],  X_phys[test_idx]
X_meta_train, X_meta_test   = X_meta[train_idx],  X_meta[test_idx]

y_train, y_test       = y_all[train_idx],  y_all[test_idx]
groups_train          = groups[train_idx]
rowid_train, rowid_test = rowid[train_idx], rowid[test_idx]

print("原始 TEXT 维度:", X_text_train.shape[1])

# 2) TEXT 768 -> 256 (只在 train 上 fit PCA，test 上 transform)
PCA_TEXT_DIM = 256
pca_text = PCA(n_components=PCA_TEXT_DIM, random_state=GLOBAL_SEED)

X_text_train_256 = pca_text.fit_transform(X_text_train)
X_text_test_256  = pca_text.transform(X_text_test)

print("降维后 TEXT 维度:", X_text_train_256.shape[1])

# 3) 最终早期融合特征 = TEXT(256) + GRAPH + PHY + META
X_train_early = np.concatenate(
    [X_text_train_256, X_graph_train, X_phys_train, X_meta_train],
    axis=1
)
X_test_early = np.concatenate(
    [X_text_test_256, X_graph_test, X_phys_test, X_meta_test],
    axis=1
)

print("X_train_early 形状:", X_train_early.shape)
print("X_test_early  形状:", X_test_early.shape)


Early 三模态 train 样本数: 2477
Early 三模态 test  样本数: 626
原始 TEXT 维度: 768
降维后 TEXT 维度: 256
X_train_early 形状: (2477, 580)
X_test_early  形状: (626, 580)


In [6]:
# ===== Cell 6: RF 十折调参 + OOF 计算 + 最终模型评估 =====

# 1) 超参数空间
param_distributions = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 10, 20, 30],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

base_rf = RandomForestRegressor(
    n_jobs=-1,
    random_state=GLOBAL_SEED,
)

# 2) 固定一套十折 GroupKFold，调参和 OOF 共用
cv_rf = GroupKFold(n_splits=10)
cv_indices = list(cv_rf.split(X_train_early, y_train, groups_train))

rf_search = RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_distributions,
    n_iter=30,
    scoring="r2",
    cv=cv_indices,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("\n==== [Early T+G+P] 在 train80% 上做十折随机搜索 ====")
rf_search.fit(X_train_early, y_train, groups=groups_train)

best_params_early = rf_search.best_params_
best_cv_score_early = rf_search.best_score_

print("\n===== Early 三模态 RF 最优超参 =====")
print(best_params_early)
print(f"CV 平均 R^2: {best_cv_score_early:.4f}")

# 3) 用最优超参 + 同一套 folds，计算 train OOF 预测
oof_pred_train = np.zeros_like(y_train, dtype=float)

for fold_idx, (tr_idx, val_idx) in enumerate(cv_indices, 1):
    print(f"  -> OOF fold {fold_idx} / {len(cv_indices)}")
    rf_fold = RandomForestRegressor(
        **best_params_early,
        n_jobs=-1,
        random_state=GLOBAL_SEED + fold_idx,
    )
    rf_fold.fit(X_train_early[tr_idx], y_train[tr_idx])
    oof_pred_train[val_idx] = rf_fold.predict(X_train_early[val_idx])

metrics_oof = compute_metrics(y_train, oof_pred_train)
print("\n===== Early 三模态 RF：train OOF 表现 =====")
for k, v in metrics_oof.items():
    print(f"{k}: {v:.4f}")

# 4) 在整个 train80% 上拟合最终模型，并在 test20% 上评估
best_rf_early = RandomForestRegressor(
    **best_params_early,
    n_jobs=-1,
    random_state=GLOBAL_SEED + 100,
)
best_rf_early.fit(X_train_early, y_train)

y_train_pred = best_rf_early.predict(X_train_early)
y_test_pred  = best_rf_early.predict(X_test_early)

metrics_train = compute_metrics(y_train, y_train_pred)
metrics_test  = compute_metrics(y_test,  y_test_pred)

print("\n===== Early 三模态 RF 训练集表现 =====")
for k, v in metrics_train.items():
    print(f"{k}: {v:.4f}")

print("\n===== Early 三模态 RF 测试集表现（独立 20%）=====")
for k, v in metrics_test.items():
    print(f"{k}: {v:.4f}")



==== [Early T+G+P] 在 train80% 上做十折随机搜索 ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=9, n_estimators=900; total time=47.6min
[CV] END max_depth=20, max_features=0.8, min_samples_leaf=2, min_samples_split=9, n_estimators=691; total time=53.3min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=3, min_samples_split=6, n_estimators=846; total time= 4.9min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=587; total time= 3.5min
[CV] END max_depth=10, max_features=0.8, min_samples_leaf=2, min_samples_split=5, n_estimators=291; total time=19.4min
[CV] END max_depth=10, max_features=0.8, min_samples_leaf=2, min_samples_split=5, n_estimators=291; total time=19.0min
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=3, min_samples_split=9, n_estimators=234; total time=10.3min
[CV] END max_depth=20, max_features=0.5, min_samples_le

In [7]:
# ===== Cell 7: 保存三模态早期融合的所有结果 =====
import pickle

EARLY_OUT_DIR = ROOT_MULTI / "early_fusion_TGP"
EARLY_OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) 融合后的特征（可以当 early-level embeddings 用）
np.save(EARLY_OUT_DIR / "X_train_TGP_early.npy", X_train_early.astype(np.float32))
np.save(EARLY_OUT_DIR / "X_test_TGP_early.npy",  X_test_early.astype(np.float32))

# 2) TEXT 降维后的 embedding
np.save(EARLY_OUT_DIR / "text_256_train.npy", X_text_train_256.astype(np.float32))
np.save(EARLY_OUT_DIR / "text_256_test.npy",  X_text_test_256.astype(np.float32))

# 3) 标签 / 预测 / OOF
np.save(EARLY_OUT_DIR / "y_train_TGP.npy",      y_train.astype(np.float32))
np.save(EARLY_OUT_DIR / "y_test_TGP.npy",       y_test.astype(np.float32))
np.save(EARLY_OUT_DIR / "y_pred_train_TGP_early.npy", y_train_pred.astype(np.float32))
np.save(EARLY_OUT_DIR / "y_pred_test_TGP_early.npy",  y_test_pred.astype(np.float32))
np.save(EARLY_OUT_DIR / "oof_pred_train_TGP_early.npy", oof_pred_train.astype(np.float32))

# 4) 索引（方便后面和其他模态 / late fusion 对齐）
np.save(EARLY_OUT_DIR / "rowid_TGP.npy",        rowid.astype(np.int64))       # 所有三模态对齐样本在 df 的行号
np.save(EARLY_OUT_DIR / "train_idx_TGP_early.npy", train_idx.astype(np.int64))# 早期融合 train 部分在 rowid 中的索引
np.save(EARLY_OUT_DIR / "test_idx_TGP_early.npy",  test_idx.astype(np.int64)) # 早期融合 test 部分在 rowid 中的索引
np.save(EARLY_OUT_DIR / "rowid_train_TGP_early.npy", rowid_train.astype(np.int64))
np.save(EARLY_OUT_DIR / "rowid_test_TGP_early.npy",  rowid_test.astype(np.int64))

# 5) RF 模型 & PCA 模型 & 指标
with open(EARLY_OUT_DIR / "rf_TGP_early_model.pkl", "wb") as f:
    pickle.dump(best_rf_early, f)

with open(EARLY_OUT_DIR / "pca_text_256.pkl", "wb") as f:
    pickle.dump(pca_text, f)

metrics_all = {
    "cv_best_r2": float(best_cv_score_early),
    "oof_metrics": metrics_oof,
    "train_metrics": metrics_train,
    "test_metrics": metrics_test,
    "best_params": {
        k: (int(v) if isinstance(v, np.integer) else v)
        for k, v in best_params_early.items()
    },
    "n_train": int(len(y_train)),
    "n_test":  int(len(y_test)),
}

with open(EARLY_OUT_DIR / "metrics_TGP_early_rf.json", "w", encoding="utf-8") as f:
    json.dump(metrics_all, f, indent=2, ensure_ascii=False, default=np_encoder)

print("\n✅ 三模态早期融合 (T+G+P, TEXT→256) 全流程完成，结果已保存到:", EARLY_OUT_DIR)



✅ 三模态早期融合 (T+G+P, TEXT→256) 全流程完成，结果已保存到: /root/Invertebrates_EC50_multi_fusion/early_fusion_TGP
