In [1]:
# ===== Notebook 1 – Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import json
import pickle

import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

GLOBAL_SEED = 42

def compute_regression_metrics(y_true, y_pred):
    """计算回归指标：MAE, RMSE, R2, Pearson_r"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae  = float(mean_absolute_error(y_true, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2   = float(r2_score(y_true, y_pred))

    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        pr = float("nan")
    else:
        pr, _ = pearsonr(y_true, y_pred)
        pr = float(pr)

    return {"MAE": mae, "RMSE": rmse, "R2": r2, "Pearson_r": pr}

def np_encoder(o):
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


In [2]:
# ===== Notebook 2 – Cell 2: 路径 & 超参数网格 =====

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# 图端 embedding（来自 GRAPH.ipynb）
GRAPH_DIR        = ROOT_MULTI / "graph" / "graph_outputs"
GRAPH_EMB_PATH   = GRAPH_DIR / "reg_graph_embeddings.npy"
GRAPH_ROWID_PATH = GRAPH_DIR / "row_id_graph_for_emb.npy"

# 理化性质 MLP embedding（descMLP 输出）
PHY_DIR        = ROOT_MULTI / "phychem" / "physchem_mlp_rf_v2"
PHY_EMB_PATH   = PHY_DIR / "emb_physchem_mlp_all.npy"
PHY_ROWID_PATH = PHY_DIR / "row_id_clean.npy"

# 原始数据表
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

# 输出目录：Graph + PhysChem 早期融合（简化版）
FUSION_ROOT_GP = ROOT_MULTI / "early(G+P)"
OUT_DIR_GP     = FUSION_ROOT_GP / "graph_plus_physchem_simple"
MODELS_DIR_GP  = OUT_DIR_GP / "models"

for d in [FUSION_ROOT_GP, OUT_DIR_GP, MODELS_DIR_GP]:
    d.mkdir(parents=True, exist_ok=True)

# 列名
SMILES_COL   = "SMILES_Canonical_RDKit"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
DURATION_COL = "Duration_Value(hour)"
LABEL_RAW    = "mgperL"
LABEL_LOG    = "mgperL_log"
LABEL_COL    = LABEL_LOG  # 想用原始 mgperL 就改成 LABEL_RAW

# RF 超参空间（跟 Text 那边保持一致）
param_distributions_rf = {
    "n_estimators":      [200, 300, 500, 800, 1000],
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf":  [1, 2, 4],
    "max_features":      ["sqrt", "log2", 0.3, 0.5, 0.8],
}

print("GRAPH_EMB_PATH:", GRAPH_EMB_PATH)
print("PHY_EMB_PATH  :", PHY_EMB_PATH)
print("DATA_PATH     :", DATA_PATH)
print("OUT_DIR_GP    :", OUT_DIR_GP)


GRAPH_EMB_PATH: /root/Invertebrates_EC50_multi_fusion/graph/graph_outputs/reg_graph_embeddings.npy
PHY_EMB_PATH  : /root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2/emb_physchem_mlp_all.npy
DATA_PATH     : /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
OUT_DIR_GP    : /root/Invertebrates_EC50_multi_fusion/early(G+P)/graph_plus_physchem_simple


In [3]:
# ===== Notebook 2 – Cell 3: 读取 df & 生成标签列 =====

df = pd.read_excel(DATA_PATH, engine="openpyxl")

if "row_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "row_id"})
df["row_id"] = df["row_id"].astype(int)

if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    df[LABEL_LOG] = np.where(mask_valid, np.log10(df[LABEL_RAW]), np.nan)

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG, DURATION_COL, EFFECT_COL, ENDPOINT_COL]].head())


df 形状: (3620, 36)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log  Duration_Value(hour)  \
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943                  96.0   
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940                  24.0   
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660                  96.0   
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754                  24.0   
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849                  96.0   

  Effect Endpoint  
0    ITX     EC50  
1    ITX     EC50  
2    ITX     EC50  
3    ITX     EC50  
4    ITX     EC50  


In [4]:
# ===== Notebook 2 – Cell 4: 加载 Graph & PhysChem，对齐构造特征 =====

# 图端
graph_emb   = np.load(GRAPH_EMB_PATH)                # (N_graph, d_g)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)  # (N_graph,)
print("graph_emb 形状:", graph_emb.shape)
print("rowid_graph 范围:", rowid_graph.min(), "→", rowid_graph.max())

# 理化性质 MLP
phys_emb   = np.load(PHY_EMB_PATH)                   # (N_phys, d_phys)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)     # (N_phys,)
print("phys_emb 形状:", phys_emb.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

# 交集 row_id
df_indexed = df.set_index("row_id")

ids_graph = set(rowid_graph.tolist())
ids_phys  = set(rowid_phys.tolist())
ids_intersection = sorted(list(ids_graph & ids_phys & set(df_indexed.index.tolist())))

print("Graph+PhysChem row_id 交集样本数:", len(ids_intersection))

# 建映射
idx_map_graph = {rid: i for i, rid in enumerate(rowid_graph)}
idx_map_phys  = {rid: i for i, rid in enumerate(rowid_phys)}

meta_gp_list  = []
graph_gp_list = []
phys_gp_list  = []
y_gp_list     = []
rowid_gp_list = []

for rid in ids_intersection:
    row_meta = df_indexed.loc[rid]
    if pd.isna(row_meta[LABEL_COL]) or not np.isfinite(row_meta[LABEL_COL]):
        continue

    meta_gp_list.append(row_meta)
    graph_gp_list.append(graph_emb[idx_map_graph[rid]])
    phys_gp_list.append(phys_emb[idx_map_phys[rid]])
    y_gp_list.append(row_meta[LABEL_COL])
    rowid_gp_list.append(rid)

meta_gp  = pd.DataFrame(meta_gp_list).reset_index(drop=True)
graph_gp = np.stack(graph_gp_list, axis=0)
phys_gp  = np.stack(phys_gp_list, axis=0)
y_gp     = np.array(y_gp_list, dtype=float)
rowid_gp = np.array(rowid_gp_list, dtype=int)

print("对齐后样本数:", len(y_gp))
print("graph_gp 形状:", graph_gp.shape)
print("phys_gp  形状:", phys_gp.shape)
print("y_gp     形状:", y_gp.shape)

# 过滤 Duration / Effect / Endpoint 缺失
mask_keep_gp = (
    meta_gp[DURATION_COL].notna()
    & meta_gp[EFFECT_COL].notna()
    & meta_gp[ENDPOINT_COL].notna()
)

print("初始样本数:", len(meta_gp))
print("保留样本数:", int(mask_keep_gp.sum()))

meta_gp  = meta_gp.loc[mask_keep_gp].reset_index(drop=True)
graph_gp = graph_gp[mask_keep_gp.values]
phys_gp  = phys_gp[mask_keep_gp.values]
y_gp     = y_gp[mask_keep_gp.values]
rowid_gp = rowid_gp[mask_keep_gp.values]

print("过滤后 graph_gp 形状:", graph_gp.shape)
print("过滤后 phys_gp  形状:", phys_gp.shape)
print("过滤后 y_gp     形状:", y_gp.shape)

# One-hot: Effect + Endpoint
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies_gp = pd.get_dummies(meta_gp[cat_cols], dummy_na=False)
cat_gp = cat_dummies_gp.values.astype(float)
cat_feature_names_gp = list(cat_dummies_gp.columns)

# Duration 数值特征
meta_gp[DURATION_COL] = pd.to_numeric(meta_gp[DURATION_COL], errors="coerce")
dur_median_gp = meta_gp[DURATION_COL].median()
meta_gp[DURATION_COL] = meta_gp[DURATION_COL].fillna(dur_median_gp)
dur_gp = meta_gp[[DURATION_COL]].values.astype(float)
numeric_feature_names_gp = [DURATION_COL]

# 分组 SMILES
groups_gp = meta_gp[SMILES_COL].astype(str).values

print("cat_gp 形状:", cat_gp.shape)
print("dur_gp 形状:", dur_gp.shape)
print("样本总数 N_gp:", len(y_gp))


graph_emb 形状: (3213, 256)
rowid_graph 范围: 1 → 3619
phys_emb 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
Graph+PhysChem row_id 交集样本数: 3103
对齐后样本数: 3103
graph_gp 形状: (3103, 256)
phys_gp  形状: (3103, 64)
y_gp     形状: (3103,)
初始样本数: 3103
保留样本数: 3103
过滤后 graph_gp 形状: (3103, 256)
过滤后 phys_gp  形状: (3103, 64)
过滤后 y_gp     形状: (3103,)
cat_gp 形状: (3103, 3)
dur_gp 形状: (3103, 1)
样本总数 N_gp: 3103


In [5]:
# ===== Notebook 2 – Cell 5: 8:2 划分 + 构造最终特征 =====

N_gp = len(y_gp)
gss_gp = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2025)

train_idx_gp, test_idx_gp = next(
    gss_gp.split(np.zeros(N_gp), y_gp, groups=groups_gp)
)

train_idx_gp = np.array(train_idx_gp, dtype=np.int64)
test_idx_gp  = np.array(test_idx_gp, dtype=np.int64)

print("Graph+PhysChem train 样本数:", len(train_idx_gp))
print("Graph+PhysChem test  样本数:", len(test_idx_gp))

np.save(OUT_DIR_GP / "train_idx_gp.npy", train_idx_gp)
np.save(OUT_DIR_GP / "test_idx_gp.npy",  test_idx_gp)
np.save(OUT_DIR_GP / "row_id_train_gp.npy", rowid_gp[train_idx_gp])
np.save(OUT_DIR_GP / "row_id_test_gp.npy",  rowid_gp[test_idx_gp])

# 构造 train / test 特征
graph_train_raw = graph_gp[train_idx_gp]
graph_test_raw  = graph_gp[test_idx_gp]

phys_train_raw  = phys_gp[train_idx_gp]
phys_test_raw   = phys_gp[test_idx_gp]

dur_train_raw = dur_gp[train_idx_gp]
dur_test_raw  = dur_gp[test_idx_gp]

cat_train = cat_gp[train_idx_gp]
cat_test  = cat_gp[test_idx_gp]

y_train_gp = y_gp[train_idx_gp]
y_test_gp  = y_gp[test_idx_gp]
groups_train_gp = groups_gp[train_idx_gp]

# 标准化（只在 train80% 上拟合）
scaler_graph = StandardScaler().fit(graph_train_raw)
scaler_phys  = StandardScaler().fit(phys_train_raw)
scaler_dur   = StandardScaler().fit(dur_train_raw)

graph_train_std = scaler_graph.transform(graph_train_raw)
graph_test_std  = scaler_graph.transform(graph_test_raw)

phys_train_std = scaler_phys.transform(phys_train_raw)
phys_test_std  = scaler_phys.transform(phys_test_raw)

dur_train_std  = scaler_dur.transform(dur_train_raw)
dur_test_std   = scaler_dur.transform(dur_test_raw)

# 拼接最终特征：[graph_std, phys_std, dur_std, onehot]
X_train_gp = np.concatenate(
    [graph_train_std, phys_train_std, dur_train_std, cat_train], axis=1
)
X_test_gp = np.concatenate(
    [graph_test_std, phys_test_std, dur_test_std, cat_test], axis=1
)

print("X_train_gp 形状:", X_train_gp.shape)
print("X_test_gp  形状:", X_test_gp.shape)


Graph+PhysChem train 样本数: 2477
Graph+PhysChem test  样本数: 626
X_train_gp 形状: (2477, 324)
X_test_gp  形状: (626, 324)


In [6]:
# ===== Graph+PhysChem: 保存早期融合后的 embeddings =====

# 1) 用 train80% 拟合好的 scaler，对全体样本 transform
graph_all_std_gp = scaler_graph.transform(graph_gp)
phys_all_std_gp  = scaler_phys.transform(phys_gp)
dur_all_std_gp   = scaler_dur.transform(dur_gp)

# 2) 全体样本拼接：[graph_std, phys_std, dur_std, onehot]
X_all_gp = np.concatenate(
    [graph_all_std_gp, phys_all_std_gp, dur_all_std_gp, cat_gp],
    axis=1
)
print("X_all_gp 形状:", X_all_gp.shape)

# 3) 保存 train/test 特征和全体 fused embedding
np.save(OUT_DIR_GP / "X_train_gp.npy", X_train_gp)
np.save(OUT_DIR_GP / "X_test_gp.npy",  X_test_gp)
np.save(OUT_DIR_GP / "y_train_gp.npy", y_train_gp)
np.save(OUT_DIR_GP / "y_test_gp.npy",  y_test_gp)

np.save(OUT_DIR_GP / "X_all_gp_fused.npy", X_all_gp)
np.save(OUT_DIR_GP / "row_id_all_gp.npy",  rowid_gp)

print("✅ Graph+PhysChem 早期融合 embeddings 已保存到:", OUT_DIR_GP)


X_all_gp 形状: (3103, 324)
✅ Graph+PhysChem 早期融合 embeddings 已保存到: /root/Invertebrates_EC50_multi_fusion/early(G+P)/graph_plus_physchem_simple


In [7]:
# ===== Notebook 2 – Cell 6: 十折 GroupKFold 调参 + 最终模型 & test 评估 =====

rf_base_gp = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)

cv_gp = GroupKFold(n_splits=10)

rf_search_gp = RandomizedSearchCV(
    estimator=rf_base_gp,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_gp,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 Graph+PhysChem train80% 上用十折 GroupKFold 调参 RF ...")
rf_search_gp.fit(X_train_gp, y_train_gp, groups=groups_train_gp)

best_params_gp = rf_search_gp.best_params_
best_score_gp  = rf_search_gp.best_score_

print("\nGraph+PhysChem 最优超参 (基于 train80% 十折CV):")
print(best_params_gp)
print("最优CV分数 (neg MAE):", best_score_gp)

# 最终模型：在整个 train80% 上重训
rf_final_gp = RandomForestRegressor(
    **best_params_gp,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final_gp.fit(X_train_gp, y_train_gp)

y_train_pred_gp = rf_final_gp.predict(X_train_gp)
y_test_pred_gp  = rf_final_gp.predict(X_test_gp)

metrics_train_gp = compute_regression_metrics(y_train_gp, y_train_pred_gp)
metrics_test_gp  = compute_regression_metrics(y_test_gp,  y_test_pred_gp)

print("\n===== Graph+PhysChem 最终 RF：train(80%) 指标 =====")
for k, v in metrics_train_gp.items():
    print(f"{k}: {v:.4f}")

print("\n===== Graph+PhysChem 最终 RF：独立 test(20%) 指标 =====")
for k, v in metrics_test_gp.items():
    print(f"{k}: {v:.4f}")

FINAL_MODEL_GP_PATH   = OUT_DIR_GP / "rf_graph_phys_8_2_final_simple.joblib"
METRICS_GP_JSON_PATH  = OUT_DIR_GP / "metrics_graph_phys_rf_8_2_simple.json"

joblib.dump(
    {
        "model": rf_final_gp,
        "scaler_graph": scaler_graph,
        "scaler_phys":  scaler_phys,
        "scaler_dur":   scaler_dur,
        "cat_feature_names": cat_feature_names_gp,
        "numeric_feature_names": numeric_feature_names_gp,
        "config": {
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions": param_distributions_rf,
        },
    },
    FINAL_MODEL_GP_PATH
)

np.save(OUT_DIR_GP / "y_train_gp.npy",        y_train_gp)
np.save(OUT_DIR_GP / "y_train_pred_gp.npy",   y_train_pred_gp)
np.save(OUT_DIR_GP / "y_test_gp.npy",         y_test_gp)
np.save(OUT_DIR_GP / "y_test_pred_gp.npy",    y_test_pred_gp)

with open(METRICS_GP_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_cv": best_params_gp,
            "best_score_cv_neg_mae": float(best_score_gp),
            "train_metrics": metrics_train_gp,
            "test_metrics": metrics_test_gp,
            "n_total": int(len(y_gp)),
            "n_train": int(len(y_train_gp)),
            "n_test": int(len(y_test_gp)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Graph+PhysChem 最终模型 & 指标已保存：")
print("   模型路径:", FINAL_MODEL_GP_PATH)
print("   指标路径:", METRICS_GP_JSON_PATH)


开始在 Graph+PhysChem train80% 上用十折 GroupKFold 调参 RF ...
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=40, max_features=0.8, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=33.3min
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=15.9min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=17.8min
[CV] END max_depth=10, max_features=0.8, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 9.1min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time= 3.1min
[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.0min
[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.2min
[CV] END max_depth=20, max_features=log2, min