In [1]:
# ===== Cell 1: 依赖导入 & 工具函数 =====
from pathlib import Path
import json
import pickle

import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# Seed reused as `random_state` by the text SVD, the random forest and the
# randomized hyper-parameter search further down in this notebook.
GLOBAL_SEED = 42

def compute_regression_metrics(y_true, y_pred):
    """Return MAE, RMSE, R2 and Pearson r for a set of predictions.

    Both inputs are converted to float NumPy arrays first. The result is a
    dict with the keys "MAE", "RMSE", "R2" and "Pearson_r", each a Python
    float. "Pearson_r" is NaN when either array has zero standard deviation.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    metrics = {
        "MAE": float(mean_absolute_error(actual, predicted)),
        "RMSE": float(np.sqrt(mean_squared_error(actual, predicted))),
        "R2": float(r2_score(actual, predicted)),
    }

    # The correlation is only computed when both vectors actually vary.
    if np.std(actual) != 0 and np.std(predicted) != 0:
        r_value, _p_value = pearsonr(actual, predicted)
        metrics["Pearson_r"] = float(r_value)
    else:
        metrics["Pearson_r"] = float("nan")

    return metrics

def np_encoder(o):
    """`json.dump(..., default=...)` hook that converts NumPy values to plain Python.

    Args:
        o: the object the JSON encoder could not serialize on its own.

    Returns:
        The `bool`, `int`, `float` or (nested) `list` equivalent of `o`.

    Raises:
        TypeError: if `o` is not a NumPy boolean, integer, floating scalar
            or ndarray.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch;
    # without it a NumPy boolean would fall through to the TypeError below.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


In [2]:
# ===== Cell 2: paths & RF hyper-parameter search space =====

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Full text CLS embeddings (row order matches df)
TEXT_DIR     = ROOT_MULTI.joinpath("SMILES", "smiles_outputs")
TEXT_EMB_768 = TEXT_DIR.joinpath("reg_smiles_cls_embeddings_all.npy")

# Graph-side embeddings and the row_id array that accompanies them
GRAPH_DIR        = ROOT_MULTI.joinpath("graph", "graph_outputs")
GRAPH_EMB_PATH   = GRAPH_DIR.joinpath("reg_graph_embeddings.npy")
GRAPH_ROWID_PATH = GRAPH_DIR.joinpath("row_id_graph_for_emb.npy")

# Raw data table (SMILES / mgperL / Duration / Effect / Endpoint)
DATA_PATH = Path("/root/fusion_dataset/Invertebrates_EC50_unique.xlsx")

# Output directories for the simplified Text + Graph early fusion
FUSION_ROOT_TG = ROOT_MULTI.joinpath("early(T+G)")
OUT_DIR_TG     = FUSION_ROOT_TG.joinpath("text_plus_graph_simple")
MODELS_DIR_TG  = OUT_DIR_TG.joinpath("models")
for d in (FUSION_ROOT_TG, OUT_DIR_TG, MODELS_DIR_TG):
    d.mkdir(parents=True, exist_ok=True)

# Column names (adjust to your own table)
SMILES_COL   = "SMILES_Canonical_RDKit"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
DURATION_COL = "Duration_Value(hour)"   # change if your column is named differently
LABEL_RAW, LABEL_LOG = "mgperL", "mgperL_log"
LABEL_COL = LABEL_LOG                   # switch to LABEL_RAW to model raw mg/L

# Target dimensionality for the text embedding reduction: 768 -> 256
TEXT_DIM_TARGET = 256

# Candidate values sampled by the RF randomized search
param_distributions_rf = dict(
    n_estimators=[200, 300, 500, 800, 1000],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", 0.3, 0.5, 0.8],
)

print("TEXT_EMB_768 :", TEXT_EMB_768)
print("GRAPH_EMB_PATH:", GRAPH_EMB_PATH)
print("DATA_PATH    :", DATA_PATH)
print("OUT_DIR_TG   :", OUT_DIR_TG)


TEXT_EMB_768 : /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs/reg_smiles_cls_embeddings_all.npy
GRAPH_EMB_PATH: /root/Invertebrates_EC50_multi_fusion/graph/graph_outputs/reg_graph_embeddings.npy
DATA_PATH    : /root/fusion_dataset/Invertebrates_EC50_unique.xlsx
OUT_DIR_TG   : /root/Invertebrates_EC50_multi_fusion/early(T+G)/text_plus_graph_simple


In [3]:
# ===== Cell 3: load df & build mgperL_log =====

df = pd.read_excel(DATA_PATH, engine="openpyxl")

# Make sure a row_id column exists. When the table has none, row_id is the
# positional row number. It is inserted directly (rather than through
# reset_index + rename) so a pre-existing column named "index" cannot clash.
if "row_id" not in df.columns:
    df.insert(0, "row_id", np.arange(len(df)))
df["row_id"] = df["row_id"].astype(int)

# Label: derive mgperL_log from mgperL when the table does not provide it.
if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    # Mask non-positive values *before* taking the log. np.where evaluates both
    # of its branches eagerly, so calling np.log10 on the unmasked column emits
    # RuntimeWarnings for zero / negative concentrations. Masked entries become
    # NaN, exactly as before.
    df[LABEL_LOG] = np.log10(df[LABEL_RAW].where(mask_valid))

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG, DURATION_COL, EFFECT_COL, ENDPOINT_COL]].head())


df 形状: (3620, 12)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log  Duration_Value(hour)  \
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943                  96.0   
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940                  24.0   
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660                  96.0   
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754                  24.0   
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849                  96.0   

  Effect Endpoint  
0    ITX     EC50  
1    ITX     EC50  
2    ITX     EC50  
3    ITX     EC50  
4    ITX     EC50  


In [4]:
# ===== Cell 4: load Text & Graph embeddings and align them on row_id =====

# 1) Full text CLS embeddings, stored in df row order (row i <-> i-th row of df).
text_all = np.load(TEXT_EMB_768)
assert text_all.shape[0] == len(df), "text_all 行数与 df 不一致，请检查。"
print("text_all 形状:", text_all.shape)

# 2) Graph embeddings plus the row_id each embedding row belongs to.
graph_emb   = np.load(GRAPH_EMB_PATH)                # (N_graph, d_g)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)  # (N_graph,)
assert graph_emb.shape[0] == len(rowid_graph), "graph_emb 行数与 rowid_graph 不一致，请检查。"
print("graph_emb 形状:", graph_emb.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

# 3) Align the Text+Graph subset on the row_id intersection.
df_indexed = df.set_index("row_id")
# A duplicated row_id would make `.loc[rid]` return several rows instead of
# one, which breaks the per-row label check below with an obscure error.
assert df_indexed.index.is_unique, "df 的 row_id 存在重复，请检查。"

ids_graph = set(rowid_graph.tolist())
ids_all   = set(df_indexed.index.tolist())
ids_intersection = sorted(ids_graph & ids_all)

print("Text+Graph row_id 交集样本数:", len(ids_intersection))

# Lookup table: row_id -> row position in graph_emb.
idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph.tolist())}
# Lookup table: row_id -> row position in df, and therefore in text_all.
# text_all follows df row order, so it must be indexed by position; using the
# row_id value itself is only correct when row_id happens to be 0..N-1.
idx_map_text = {rid: i for i, rid in enumerate(df_indexed.index.tolist())}

meta_tg_list  = []
text_tg_list  = []
graph_tg_list = []
y_tg_list     = []
rowid_tg_list = []

for rid in ids_intersection:
    row_meta = df_indexed.loc[rid]
    label = row_meta[LABEL_COL]
    # Skip samples without a usable (finite) label.
    if pd.isna(label) or not np.isfinite(label):
        continue

    meta_tg_list.append(row_meta)
    text_tg_list.append(text_all[idx_map_text[rid]])     # text CLS embedding
    graph_tg_list.append(graph_emb[idx_map_graph[rid]])  # graph embedding
    y_tg_list.append(label)
    rowid_tg_list.append(rid)

# np.stack fails with an unhelpful message on an empty list; fail early instead.
assert len(y_tg_list) > 0, "Text+Graph 对齐后没有可用样本，请检查 row_id 与标签。"

meta_tg  = pd.DataFrame(meta_tg_list).reset_index(drop=True)
text_tg  = np.stack(text_tg_list, axis=0)
graph_tg = np.stack(graph_tg_list, axis=0)
y_tg     = np.array(y_tg_list, dtype=float)
rowid_tg = np.array(rowid_tg_list, dtype=int)

print("对齐后样本数:", len(y_tg))
print("text_tg  形状:", text_tg.shape)
print("graph_tg 形状:", graph_tg.shape)
print("y_tg     形状:", y_tg.shape)


text_all 形状: (3620, 768)
graph_emb 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
Text+Graph row_id 交集样本数: 3213
对齐后样本数: 3213
text_tg  形状: (3213, 768)
graph_tg 形状: (3213, 256)
y_tg     形状: (3213,)


In [5]:
# ===== Cell 5: Duration feature + Effect/Endpoint one-hot + SMILES groups =====

meta2 = meta_tg.copy()

# Coerce Duration to numeric *before* filtering. Otherwise an unparseable entry
# passes the "must be present" filter and later has to be imputed with a median
# computed over all samples, i.e. before the train/test split.
meta2[DURATION_COL] = pd.to_numeric(meta2[DURATION_COL], errors="coerce")

# Filter: Duration / Effect / Endpoint must all be present.
mask_keep_tg = (
    meta2[DURATION_COL].notna()
    & meta2[EFFECT_COL].notna()
    & meta2[ENDPOINT_COL].notna()
)

print("初始样本数:", len(meta2))
print("保留样本数:", int(mask_keep_tg.sum()))

# The arrays below are overwritten in place, so re-running this cell without
# re-running the alignment cell would apply a mask of the wrong length.
assert len(meta2) == len(text_tg) == len(graph_tg) == len(y_tg) == len(rowid_tg), \
    "meta_tg 与 embedding 数组长度不一致，请先重新运行对齐单元。"

keep_tg  = mask_keep_tg.values
meta2    = meta2.loc[mask_keep_tg].reset_index(drop=True)
text_tg  = text_tg[keep_tg]
graph_tg = graph_tg[keep_tg]
y_tg     = y_tg[keep_tg]
rowid_tg = rowid_tg[keep_tg]

print("过滤后 text_tg  形状:", text_tg.shape)
print("过滤后 graph_tg 形状:", graph_tg.shape)
print("过滤后 y_tg    形状:", y_tg.shape)

# One-hot encoding of Effect + Endpoint
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies_tg = pd.get_dummies(meta2[cat_cols], dummy_na=False)
cat_tg = cat_dummies_tg.values.astype(float)
cat_feature_names_tg = list(cat_dummies_tg.columns)

# Numeric Duration feature. Every remaining row has a valid duration, so no
# imputation is needed; the median is still computed so the name stays defined.
dur_median_tg = meta2[DURATION_COL].median()
dur_tg = meta2[[DURATION_COL]].values.astype(float)
numeric_feature_names_tg = [DURATION_COL]

# Grouping key: the SMILES string of each sample
groups_tg = meta2[SMILES_COL].astype(str).values

print("cat_tg 形状:", cat_tg.shape)
print("dur_tg 形状:", dur_tg.shape)
print("样本总数 N_tg:", len(y_tg))


初始样本数: 3213
保留样本数: 3213
过滤后 text_tg  形状: (3213, 768)
过滤后 graph_tg 形状: (3213, 256)
过滤后 y_tg    形状: (3213,)
cat_tg 形状: (3213, 3)
dur_tg 形状: (3213, 1)
样本总数 N_tg: 3213


In [6]:
# ===== Cell 6: 80/20 split grouped by SMILES + final feature matrices =====

N_tg = len(y_tg)
gss_tg = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2025)

# n_splits=1, so the split generator yields a single (train, test) index pair.
train_idx_tg, test_idx_tg = (
    np.array(part, dtype=np.int64)
    for part in next(gss_tg.split(np.zeros(N_tg), y_tg, groups=groups_tg))
)

print("Text+Graph train 样本数:", len(train_idx_tg))
print("Text+Graph test  样本数:", len(test_idx_tg))

# Persist the split indices and the matching row_ids for later analysis.
for _fname, _arr in (
    ("train_idx_tg.npy",    train_idx_tg),
    ("test_idx_tg.npy",     test_idx_tg),
    ("row_id_train_tg.npy", rowid_tg[train_idx_tg]),
    ("row_id_test_tg.npy",  rowid_tg[test_idx_tg]),
):
    np.save(OUT_DIR_TG / _fname, _arr)

# ========== Raw per-modality features for train / test ==========
text_train_raw,  text_test_raw  = text_tg[train_idx_tg],  text_tg[test_idx_tg]
graph_train_raw, graph_test_raw = graph_tg[train_idx_tg], graph_tg[test_idx_tg]
dur_train_raw,   dur_test_raw   = dur_tg[train_idx_tg],   dur_tg[test_idx_tg]
cat_train,       cat_test       = cat_tg[train_idx_tg],   cat_tg[test_idx_tg]
y_train_tg,      y_test_tg      = y_tg[train_idx_tg],     y_tg[test_idx_tg]
groups_train_tg = groups_tg[train_idx_tg]

# ========== Fit scalers & SVD on the training portion only ==========
# 1) Standardization
scaler_text = StandardScaler()
scaler_text.fit(text_train_raw)
scaler_graph = StandardScaler()
scaler_graph.fit(graph_train_raw)
scaler_dur = StandardScaler()
scaler_dur.fit(dur_train_raw)

text_train_std, text_test_std = (
    scaler_text.transform(part) for part in (text_train_raw, text_test_raw)
)
graph_train_std, graph_test_std = (
    scaler_graph.transform(part) for part in (graph_train_raw, graph_test_raw)
)
dur_train_std, dur_test_std = (
    scaler_dur.transform(part) for part in (dur_train_raw, dur_test_raw)
)

# 2) Text reduction 768 -> 256: fit on train, then project test with the same SVD.
svd_text = TruncatedSVD(n_components=TEXT_DIM_TARGET, random_state=GLOBAL_SEED)
text_train_256 = svd_text.fit_transform(text_train_std)
text_test_256  = svd_text.transform(text_test_std)

print("text_train_256 形状:", text_train_256.shape)

# 3) Final feature layout: [text_256, graph_std, dur_std, onehot]
train_parts = [text_train_256, graph_train_std, dur_train_std, cat_train]
test_parts  = [text_test_256,  graph_test_std,  dur_test_std,  cat_test]
X_train_tg = np.concatenate(train_parts, axis=1)
X_test_tg  = np.concatenate(test_parts, axis=1)

print("X_train_tg 形状:", X_train_tg.shape)
print("X_test_tg  形状:", X_test_tg.shape)


Text+Graph train 样本数: 2581
Text+Graph test  样本数: 632
text_train_256 形状: (2581, 256)
X_train_tg 形状: (2581, 516)
X_test_tg  形状: (632, 516)


In [7]:
# ===== Cell 7: persist the Text+Graph early-fusion embeddings =====

# 1) Apply the scalers / SVD fitted on the training portion to every sample.
text_all_std  = scaler_text.transform(text_tg)
graph_all_std = scaler_graph.transform(graph_tg)
dur_all_std   = scaler_dur.transform(dur_tg)

# Text 768 -> 256 with the already-fitted svd_text (no refit here).
text_all_256 = svd_text.transform(text_all_std)

# 2) Fused features for all samples: [text_256, graph_std, dur_std, onehot]
all_parts = [text_all_256, graph_all_std, dur_all_std, cat_tg]
X_all_tg = np.concatenate(all_parts, axis=1)
print("X_all_tg 形状:", X_all_tg.shape)

# 3) Write the train/test matrices, then the full fused matrix and its row_ids
#    (the latter two are meant for aligning later fusion stages).
for _fname, _arr in (
    ("X_train_tg.npy",     X_train_tg),
    ("X_test_tg.npy",      X_test_tg),
    ("y_train_tg.npy",     y_train_tg),
    ("y_test_tg.npy",      y_test_tg),
    ("X_all_tg_fused.npy", X_all_tg),
    ("row_id_all_tg.npy",  rowid_tg),
):
    np.save(OUT_DIR_TG / _fname, _arr)

print("✅ Text+Graph 早期融合 embeddings 已保存到:", OUT_DIR_TG)


X_all_tg 形状: (3213, 516)
✅ Text+Graph 早期融合 embeddings 已保存到: /root/Invertebrates_EC50_multi_fusion/early(T+G)/text_plus_graph_simple


In [8]:
# ===== Cell 8: 10-fold GroupKFold tuning + final model & test evaluation =====

# 1) Base RF used inside the search.
#    n_jobs=1 on purpose: the search below already runs its fits in parallel
#    (n_jobs=-1), and a forest that also asks for every core oversubscribes the
#    CPU. The fitted trees do not depend on n_jobs, so results are unchanged.
rf_base_tg = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=1,
)

# 2) 10-fold GroupKFold inside the training portion, grouped by SMILES.
cv_tg = GroupKFold(n_splits=10)

# 3) RandomizedSearchCV optimizing negative MAE.
#    refit=False: the final model is trained explicitly in step 4, so the
#    search's automatic refit would only repeat that work. best_params_ and
#    best_score_ are still populated for single-metric scoring.
rf_search_tg = RandomizedSearchCV(
    estimator=rf_base_tg,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_tg,
    n_jobs=-1,
    refit=False,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 Text+Graph train80% 上用十折 GroupKFold 调参 RF ...")
rf_search_tg.fit(X_train_tg, y_train_tg, groups=groups_train_tg)

best_params_tg = rf_search_tg.best_params_
best_score_tg  = rf_search_tg.best_score_

print("\nText+Graph 最优超参 (基于 train80% 十折 CV):")
print(best_params_tg)
print("最优CV分数 (neg MAE):", best_score_tg)

# 4) Retrain the final model on the whole training portion with the best
#    hyper-parameters; no outer parallelism here, so the forest may use all cores.
rf_final_tg = RandomForestRegressor(
    **best_params_tg,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final_tg.fit(X_train_tg, y_train_tg)

# 5) Evaluate on the training portion and on the held-out test portion.
y_train_pred_tg = rf_final_tg.predict(X_train_tg)
y_test_pred_tg  = rf_final_tg.predict(X_test_tg)

metrics_train_tg = compute_regression_metrics(y_train_tg, y_train_pred_tg)
metrics_test_tg  = compute_regression_metrics(y_test_tg,  y_test_pred_tg)

print("\n===== Text+Graph 最终 RF：train(80%) 指标 =====")
for k, v in metrics_train_tg.items():
    print(f"{k}: {v:.4f}")

print("\n===== Text+Graph 最终 RF：独立 test(20%) 指标 =====")
for k, v in metrics_test_tg.items():
    print(f"{k}: {v:.4f}")

# 6) Persist the model, the fitted transformers, the predictions and the metrics.
FINAL_MODEL_TG_PATH  = OUT_DIR_TG / "rf_text_graph_8_2_final_simple.joblib"
METRICS_TG_JSON_PATH = OUT_DIR_TG / "metrics_text_graph_rf_8_2_simple.json"

# One bundle holding everything needed to reproduce the feature pipeline.
joblib.dump(
    {
        "model": rf_final_tg,
        "scaler_text": scaler_text,
        "svd_text": svd_text,
        "scaler_graph": scaler_graph,
        "scaler_dur": scaler_dur,
        "cat_feature_names": cat_feature_names_tg,
        "numeric_feature_names": numeric_feature_names_tg,
        "config": {
            "TEXT_DIM_TARGET": TEXT_DIM_TARGET,
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions": param_distributions_rf,
        },
    },
    FINAL_MODEL_TG_PATH
)

np.save(OUT_DIR_TG / "y_train_tg.npy",      y_train_tg)
np.save(OUT_DIR_TG / "y_train_pred_tg.npy", y_train_pred_tg)
np.save(OUT_DIR_TG / "y_test_tg.npy",       y_test_tg)
np.save(OUT_DIR_TG / "y_test_pred_tg.npy",  y_test_pred_tg)

# np_encoder converts any NumPy scalars / arrays the JSON encoder cannot handle.
with open(METRICS_TG_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_cv": best_params_tg,
            "best_score_cv_neg_mae": float(best_score_tg),
            "train_metrics": metrics_train_tg,
            "test_metrics": metrics_test_tg,
            "n_total": int(len(y_tg)),
            "n_train": int(len(y_train_tg)),
            "n_test": int(len(y_test_tg)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Text+Graph 最终模型 & 指标已保存：")
print("   模型路径:", FINAL_MODEL_TG_PATH)
print("   指标路径:", METRICS_TG_JSON_PATH)


开始在 Text+Graph train80% 上用十折 GroupKFold 调参 RF ...
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=40, max_features=0.8, min_samples_leaf=2, min_samples_split=5, n_estimators=800; total time=66.3min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time= 3.8min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=1000; total time= 4.1min
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=12.1min
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=22.3min
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 1.7min
[CV] END max_depth=40, max_features=0.8, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=49.1min
[CV] END max_depth=10, max_features=0.5, min_s