In [1]:
# ===== Notebook 1 – Cell 1: 依赖 & 工具函数 =====
from pathlib import Path
import json
import pickle

import numpy as np
import pandas as pd

from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# Seed passed as random_state to TruncatedSVD, RandomForestRegressor and
# RandomizedSearchCV in this notebook. The train/test GroupShuffleSplit uses its
# own literal seed rather than this constant.
GLOBAL_SEED = 42

def compute_regression_metrics(y_true, y_pred):
    """Compute MAE, RMSE, R2 and Pearson r for a set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted targets; both are converted to float arrays.

    Returns
    -------
    dict
        Keys ``"MAE"``, ``"RMSE"``, ``"R2"`` and ``"Pearson_r"``, each a plain
        Python float. ``"Pearson_r"`` is NaN when either input has zero
        standard deviation.
    """
    truth = np.asarray(y_true, dtype=float)
    preds = np.asarray(y_pred, dtype=float)

    # Error-based metrics are evaluated first (same order as before) so that
    # any input validation error surfaces from the same call.
    scores = {
        "MAE": float(mean_absolute_error(truth, preds)),
        "RMSE": float(np.sqrt(mean_squared_error(truth, preds))),
        "R2": float(r2_score(truth, preds)),
    }

    # Correlation is only computed when both vectors actually vary.
    both_vary = np.std(truth) != 0 and np.std(preds) != 0
    scores["Pearson_r"] = float(pearsonr(truth, preds)[0]) if both_vary else float("nan")

    return scores

def np_encoder(o):
    """``default=`` hook for ``json.dump`` that converts NumPy objects.

    Parameters
    ----------
    o : object
        The value the JSON encoder could not serialise on its own.

    Returns
    -------
    bool, int, float or list
        The plain-Python equivalent of a NumPy boolean, integer, floating
        scalar or ndarray.

    Raises
    ------
    TypeError
        If ``o`` is none of the supported NumPy types.
    """
    # np.bool_ is not a subclass of Python bool, so the json module cannot
    # encode it without help.
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Type {type(o)} not serializable")


In [2]:
# ===== Notebook 1 – Cell 2: paths & hyper-parameter search space =====

ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")

# Full text CLS embeddings (rows aligned with the rows of df)
TEXT_DIR     = ROOT_MULTI.joinpath("SMILES", "smiles_outputs")
TEXT_EMB_768 = TEXT_DIR.joinpath("reg_smiles_cls_embeddings_all.npy")

# Physicochemical MLP embeddings (descMLP output) and their row ids
PHY_DIR        = ROOT_MULTI.joinpath("phychem", "physchem_mlp_rf_v2")
PHY_EMB_PATH   = PHY_DIR.joinpath("emb_physchem_mlp_all.npy")
PHY_ROWID_PATH = PHY_DIR.joinpath("row_id_clean.npy")

# Source table (SMILES / mgperL / Duration / Effect / Endpoint / descriptors)
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")

# Output tree for the simplified Text + PhysChem early-fusion run
FUSION_ROOT_TP = ROOT_MULTI.joinpath("early(T+P)")
OUT_DIR_TP     = FUSION_ROOT_TP.joinpath("text_plus_physchem_simple")
MODELS_DIR_TP  = OUT_DIR_TP.joinpath("models")

# Make sure every output directory exists before later cells write into it.
for d in (FUSION_ROOT_TP, OUT_DIR_TP, MODELS_DIR_TP):
    d.mkdir(parents=True, exist_ok=True)

# Column names (adjust to match the Excel sheet)
SMILES_COL   = "SMILES_Canonical_RDKit"
EFFECT_COL   = "Effect"
ENDPOINT_COL = "Endpoint"
DURATION_COL = "Duration_Value(hour)"   # change here if the sheet uses another name
LABEL_RAW    = "mgperL"
LABEL_LOG    = "mgperL_log"             # training is done in log10(mg/L) space
LABEL_COL    = LABEL_LOG

# Target width of the text block after SVD (768 -> 256)
TEXT_DIM_TARGET = 256

# Candidate values sampled by RandomizedSearchCV for the random forest
param_distributions_rf = dict(
    n_estimators=[200, 300, 500, 800, 1000],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", 0.3, 0.5, 0.8],
)

# Echo the resolved locations so a wrong root is spotted immediately.
for _label, _path in (
    ("TEXT_EMB_768 :", TEXT_EMB_768),
    ("PHY_EMB_PATH :", PHY_EMB_PATH),
    ("DATA_PATH    :", DATA_PATH),
    ("OUT_DIR_TP   :", OUT_DIR_TP),
):
    print(_label, _path)


TEXT_EMB_768 : /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs/reg_smiles_cls_embeddings_all.npy
PHY_EMB_PATH : /root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2/emb_physchem_mlp_all.npy
DATA_PATH    : /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
OUT_DIR_TP   : /root/Invertebrates_EC50_multi_fusion/early(T+P)/text_plus_physchem_simple


In [3]:
# ===== Notebook 1 – Cell 3: load the source table & derive mgperL_log =====

df = pd.read_excel(DATA_PATH, engine="openpyxl")

# Guarantee an integer row_id column; when the sheet has none, the positional
# index of each row is used.
if "row_id" not in df.columns:
    df = df.reset_index().rename(columns={"index": "row_id"})
df["row_id"] = df["row_id"].astype(int)

# Label: derive mgperL_log from mgperL when the sheet does not already carry it.
if LABEL_LOG not in df.columns:
    df[LABEL_RAW] = pd.to_numeric(df[LABEL_RAW], errors="coerce")
    mask_valid = df[LABEL_RAW] > 0
    # Mask BEFORE taking the log: non-positive concentrations become NaN first,
    # so log10 never sees 0 or negative values (which would emit RuntimeWarnings
    # and produce -inf/NaN). The resulting column is the same: log10 for valid
    # rows, NaN otherwise.
    df[LABEL_LOG] = np.log10(df[LABEL_RAW].where(mask_valid))

print("df 形状:", df.shape)
print(df[["row_id", SMILES_COL, LABEL_RAW, LABEL_LOG, DURATION_COL, EFFECT_COL, ENDPOINT_COL]].head())


df 形状: (3620, 36)
   row_id    SMILES_Canonical_RDKit  mgperL  mgperL_log  Duration_Value(hour)  \
0       0        [Cl-].[Cl-].[Zn+2]     1.3    0.113943                  96.0   
1       1  O=S(=O)([O-])[O-].[Zn+2]     2.5    0.397940                  24.0   
2       2        [Cl-].[Cl-].[Pb+2]    40.8    1.610660                  96.0   
3       3  O=S(=O)([O-])[O-].[Cu+2]     1.9    0.278754                  24.0   
4       4  O=S(=O)([O-])[O-].[Cu+2]     0.6   -0.221849                  96.0   

  Effect Endpoint  
0    ITX     EC50  
1    ITX     EC50  
2    ITX     EC50  
3    ITX     EC50  
4    ITX     EC50  


In [4]:
# ===== Notebook 1 – Cell 4: load Text & PhysChem embeddings, align them, build features =====

# 1) Full text CLS embeddings; row i belongs to the i-th row of df (positional).
text_all = np.load(TEXT_EMB_768)
assert text_all.shape[0] == len(df), "text_all 行数与 df 不一致，请检查。"
print("text_all 形状:", text_all.shape)

# 2) PhysChem MLP embeddings and the row_id each embedding row belongs to.
phys_emb   = np.load(PHY_EMB_PATH)                   # (N_phys, d_phys)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)     # (N_phys,)
# A length mismatch would otherwise pair embeddings with the wrong row ids.
assert len(rowid_phys) == len(phys_emb), "rowid_phys 与 phys_emb 行数不一致，请检查。"
print("phys_emb 形状:", phys_emb.shape)
print("rowid_phys 范围:", rowid_phys.min(), "→", rowid_phys.max())

# 3) Align the Text+PhysChem subset.
df_indexed = df.set_index("row_id")
if not df_indexed.index.is_unique:
    # Duplicate ids make the row_id -> row mapping ambiguous.
    raise ValueError("df 的 row_id 存在重复，无法与 embedding 对齐。")

# Position of every PhysChem row_id inside df (-1 when the id is absent).
# text_all is aligned by POSITION, so it must be indexed with these positions
# rather than with the raw row_id values; the two only coincide when row_id
# happens to be 0..N-1.
pos_phys = df_indexed.index.get_indexer(rowid_phys)
found    = pos_phys >= 0

# Label of every PhysChem row (NaN when the id is absent or the label is not numeric).
label_by_pos = pd.to_numeric(df_indexed[LABEL_COL], errors="coerce").to_numpy(dtype=float)
label_phys   = np.full(len(rowid_phys), np.nan, dtype=float)
label_phys[found] = label_by_pos[pos_phys[found]]

# Keep rows that exist in df and carry a finite label; order follows rowid_phys.
aligned = found & np.isfinite(label_phys)
pos_tp  = pos_phys[aligned]

# Slicing df directly keeps the original column dtypes in meta_tp.
meta_tp  = df_indexed.iloc[pos_tp].reset_index(drop=True)
text_tp  = text_all[pos_tp]
phys_tp  = phys_emb[aligned]
y_tp     = label_phys[aligned]
rowid_tp = rowid_phys[aligned]

print("Text+PhysChem 对齐后样本数:", len(y_tp))
print("text_tp 形状:", text_tp.shape)
print("phys_tp 形状:", phys_tp.shape)

# 4) Drop rows with missing Duration / Effect / Endpoint.
mask_keep = (
    meta_tp[DURATION_COL].notna()
    & meta_tp[EFFECT_COL].notna()
    & meta_tp[ENDPOINT_COL].notna()
)

print("初始样本数:", len(meta_tp))
print("保留样本数:", int(mask_keep.sum()))

meta_tp  = meta_tp.loc[mask_keep].reset_index(drop=True)
text_tp  = text_tp[mask_keep.values]
phys_tp  = phys_tp[mask_keep.values]
y_tp     = y_tp[mask_keep.values]
rowid_tp = rowid_tp[mask_keep.values]

print("过滤后 text_tp 形状:", text_tp.shape)
print("过滤后 phys_tp 形状:", phys_tp.shape)
print("过滤后 y_tp 形状:", y_tp.shape)

# 5) One-hot encode Effect + Endpoint once over the whole subset so that the
#    train and test parts share the same columns. Casting to str makes sure
#    both columns are treated as categorical whatever dtype the sheet gave them.
cat_cols = [EFFECT_COL, ENDPOINT_COL]
cat_dummies_tp = pd.get_dummies(meta_tp[cat_cols].astype(str), dummy_na=False)
cat_tp = cat_dummies_tp.values.astype(float)
cat_feature_names_tp = list(cat_dummies_tp.columns)

# 6) Duration as a numeric feature; values that fail to parse are imputed.
# NOTE(review): the median is taken over all rows, i.e. before the train/test
# split performed later in the notebook.
meta_tp[DURATION_COL] = pd.to_numeric(meta_tp[DURATION_COL], errors="coerce")
dur_median = meta_tp[DURATION_COL].median()
meta_tp[DURATION_COL] = meta_tp[DURATION_COL].fillna(dur_median)
dur_tp = meta_tp[[DURATION_COL]].values.astype(float)
numeric_feature_names_tp = [DURATION_COL]

# 7) Grouping key for the splits: the SMILES string.
groups_tp = meta_tp[SMILES_COL].astype(str).values

print("cat_tp 形状:", cat_tp.shape)
print("dur_tp 形状:", dur_tp.shape)
print("样本总数:", len(y_tp))


text_all 形状: (3620, 768)
phys_emb 形状: (3406, 64)
rowid_phys 范围: 0 → 3619
Text+PhysChem 对齐后样本数: 3406
text_tp 形状: (3406, 768)
phys_tp 形状: (3406, 64)
初始样本数: 3406
保留样本数: 3406
过滤后 text_tp 形状: (3406, 768)
过滤后 phys_tp 形状: (3406, 64)
过滤后 y_tp 形状: (3406,)
cat_tp 形状: (3406, 3)
dur_tp 形状: (3406, 1)
样本总数: 3406


In [5]:
# ===== Notebook 1 – Cell 5: 80/20 group split + final features fitted on the train part =====

N_tp = len(y_tp)
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=2025)

# One grouped split; both index arrays are stored as int64.
train_idx_tp, test_idx_tp = [
    np.array(part, dtype=np.int64)
    for part in next(gss.split(np.zeros(N_tp), y_tp, groups=groups_tp))
]

print("Text+PhysChem train 样本数:", len(train_idx_tp))
print("Text+PhysChem test  样本数:", len(test_idx_tp))

# Persist positions and row ids for later visualisation / applicability-domain work.
for _fname, _payload in (
    ("train_idx_tp.npy",    train_idx_tp),
    ("test_idx_tp.npy",     test_idx_tp),
    ("row_id_train_tp.npy", rowid_tp[train_idx_tp]),
    ("row_id_test_tp.npy",  rowid_tp[test_idx_tp]),
):
    np.save(OUT_DIR_TP / _fname, _payload)

# ========= scalers & SVD are fitted on the train part only =========
text_train_raw, text_test_raw = text_tp[train_idx_tp], text_tp[test_idx_tp]   # (N_train, 768) / (N_test, 768)
phys_train_raw, phys_test_raw = phys_tp[train_idx_tp], phys_tp[test_idx_tp]
dur_train_raw,  dur_test_raw  = dur_tp[train_idx_tp],  dur_tp[test_idx_tp]
cat_train,      cat_test      = cat_tp[train_idx_tp],  cat_tp[test_idx_tp]
y_train_tp,     y_test_tp     = y_tp[train_idx_tp],    y_tp[test_idx_tp]
groups_train_tp = groups_tp[train_idx_tp]

# 1) Standardisation: fit on train, apply the same statistics to test.
scaler_text = StandardScaler()
text_train_std = scaler_text.fit_transform(text_train_raw)
text_test_std  = scaler_text.transform(text_test_raw)

scaler_phys = StandardScaler()
phys_train_std = scaler_phys.fit_transform(phys_train_raw)
phys_test_std  = scaler_phys.transform(phys_test_raw)

scaler_dur = StandardScaler()
dur_train_std = scaler_dur.fit_transform(dur_train_raw)
dur_test_std  = scaler_dur.transform(dur_test_raw)

# 2) Text SVD 768 -> 256, fitted on the train part only.
svd_text = TruncatedSVD(n_components=TEXT_DIM_TARGET, random_state=GLOBAL_SEED)
text_train_256 = svd_text.fit_transform(text_train_std)
text_test_256  = svd_text.transform(text_test_std)

print("text_train_256 形状:", text_train_256.shape)

# 3) Final feature matrix, column order: [text_256, phys_std, dur_std, onehot]
X_train_tp = np.hstack([text_train_256, phys_train_std, dur_train_std, cat_train])
X_test_tp  = np.hstack([text_test_256,  phys_test_std,  dur_test_std,  cat_test])

print("X_train_tp 形状:", X_train_tp.shape)
print("X_test_tp  形状:", X_test_tp.shape)


Text+PhysChem train 样本数: 2716
Text+PhysChem test  样本数: 690
text_train_256 形状: (2716, 256)
X_train_tp 形状: (2716, 324)
X_test_tp  形状: (690, 324)


In [6]:
# ===== Text+PhysChem: save the early-fusion embeddings =====

# 1) Apply the scalers / SVD fitted on the train part to every aligned sample.
text_all_std_tp = scaler_text.transform(text_tp)
phys_all_std_tp = scaler_phys.transform(phys_tp)
dur_all_std_tp  = scaler_dur.transform(dur_tp)

text_all_256_tp = svd_text.transform(text_all_std_tp)  # (N_all, 256)

# 2) Fused matrix for all samples, column order: [text_256, phys_std, dur_std, onehot]
X_all_tp = np.hstack([text_all_256_tp, phys_all_std_tp, dur_all_std_tp, cat_tp])
print("X_all_tp 形状:", X_all_tp.shape)

# 3) Write the train/test matrices, their targets, the fused matrix and its row ids.
for _fname, _payload in (
    ("X_train_tp.npy",     X_train_tp),
    ("X_test_tp.npy",      X_test_tp),
    ("y_train_tp.npy",     y_train_tp),
    ("y_test_tp.npy",      y_test_tp),
    ("X_all_tp_fused.npy", X_all_tp),
    ("row_id_all_tp.npy",  rowid_tp),
):
    np.save(OUT_DIR_TP / _fname, _payload)

print("✅ Text+PhysChem 早期融合 embeddings 已保存到:", OUT_DIR_TP)


X_all_tp 形状: (3406, 324)
✅ Text+PhysChem 早期融合 embeddings 已保存到: /root/Invertebrates_EC50_multi_fusion/early(T+P)/text_plus_physchem_simple


In [7]:
# ===== Notebook 1 – Cell 6: 10-fold GroupKFold tuning + final model & test evaluation =====

# 1) Base RF used inside the search.
#    n_jobs=1 on purpose: RandomizedSearchCV below already spreads the candidate
#    fits over all cores, and a second n_jobs=-1 level inside every worker
#    oversubscribes the CPU. With a fixed random_state the fitted forest does
#    not depend on n_jobs, so the selected hyper-parameters are unaffected.
rf_base = RandomForestRegressor(
    random_state=GLOBAL_SEED,
    n_jobs=1,
)

# 2) 10-fold GroupKFold, applied inside the train part only.
cv_tp = GroupKFold(n_splits=10)

# 3) RandomizedSearchCV optimising negative MAE (higher is better).
rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions_rf,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=cv_tp,
    n_jobs=-1,
    random_state=GLOBAL_SEED,
    verbose=2,
)

print("开始在 train80% 上用十折 GroupKFold 调参 RF ...")
rf_search.fit(X_train_tp, y_train_tp, groups=groups_train_tp)

best_params_tp = rf_search.best_params_
best_score_tp  = rf_search.best_score_
print("\nText+PhysChem 最优超参 (基于 train80% 十折CV):")
print(best_params_tp)
print("最优CV分数 (neg MAE):", best_score_tp)

# 4) Retrain one final model on the whole train part with the best parameters.
#    This fit is not nested in another parallel loop, so it may use every core.
rf_final_tp = RandomForestRegressor(
    **best_params_tp,
    random_state=GLOBAL_SEED,
    n_jobs=-1,
)
rf_final_tp.fit(X_train_tp, y_train_tp)

# 5) Predict and score on the train part and on the held-out test part.
y_train_pred_tp = rf_final_tp.predict(X_train_tp)
y_test_pred_tp  = rf_final_tp.predict(X_test_tp)

metrics_train_tp = compute_regression_metrics(y_train_tp, y_train_pred_tp)
metrics_test_tp  = compute_regression_metrics(y_test_tp,  y_test_pred_tp)

print("\n===== Text+PhysChem 最终 RF：train(80%) 指标 =====")
for k, v in metrics_train_tp.items():
    print(f"{k}: {v:.4f}")

print("\n===== Text+PhysChem 最终 RF：独立 test(20%) 指标 =====")
for k, v in metrics_test_tp.items():
    print(f"{k}: {v:.4f}")

# 6) Persist the model, the fitted transformers, the predictions and the metrics.
FINAL_MODEL_TP_PATH   = OUT_DIR_TP / "rf_text_phys_8_2_final_simple.joblib"
METRICS_TP_JSON_PATH  = OUT_DIR_TP / "metrics_text_phys_rf_8_2_simple.json"

# Everything needed to reproduce the feature pipeline travels with the model.
joblib.dump(
    {
        "model": rf_final_tp,
        "scaler_text": scaler_text,
        "svd_text": svd_text,
        "scaler_phys": scaler_phys,
        "scaler_dur": scaler_dur,
        "cat_feature_names": cat_feature_names_tp,
        "numeric_feature_names": numeric_feature_names_tp,
        "config": {
            "TEXT_DIM_TARGET": TEXT_DIM_TARGET,
            "GLOBAL_SEED": int(GLOBAL_SEED),
            "param_distributions": param_distributions_rf,
        },
    },
    FINAL_MODEL_TP_PATH
)

np.save(OUT_DIR_TP / "y_train_tp.npy",        y_train_tp)
np.save(OUT_DIR_TP / "y_train_pred_tp.npy",   y_train_pred_tp)
np.save(OUT_DIR_TP / "y_test_tp.npy",         y_test_tp)
np.save(OUT_DIR_TP / "y_test_pred_tp.npy",    y_test_pred_tp)

# np_encoder converts any NumPy scalars/arrays left in the payload.
with open(METRICS_TP_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params_cv": best_params_tp,
            "best_score_cv_neg_mae": float(best_score_tp),
            "train_metrics": metrics_train_tp,
            "test_metrics": metrics_test_tp,
            "n_total": int(len(y_tp)),
            "n_train": int(len(y_train_tp)),
            "n_test": int(len(y_test_tp)),
        },
        f,
        ensure_ascii=False,
        indent=2,
        default=np_encoder,
    )

print("\n✅ Text+PhysChem 最终模型 & 指标已保存：")
print("   模型路径:", FINAL_MODEL_TP_PATH)
print("   指标路径:", METRICS_TP_JSON_PATH)


开始在 train80% 上用十折 GroupKFold 调参 RF ...
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=40, max_features=0.8, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=52.8min
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=16.3min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=16.5min
[CV] END max_depth=10, max_features=0.8, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 7.8min
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time= 2.7min
[CV] END max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.1min
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time= 3.1min
[CV] END max_depth=20, max_features=log2, min_samples_leaf=