In [1]:
# ===== AD / Topk-ADSAL 公共函数 =====
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Project root directory (the same root used by the six modelling scripts).
ROOT_MULTI = Path("/root/Invertebrates_EC50_multi_fusion")
# Excel workbook holding the dataset that is loaded into `df_all` below.
DATA_PATH  = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")
# Column whose string value is used as the cross-validation group key.
SMILES_COL = "SMILES_Canonical_RDKit"
# Raw label column, and the derived column that stores log10 of the raw label.
LABEL_RAW  = "mgperL"
LABEL_LOG  = "mgperL_log"

# AD output directory: every per-sample CSV and summary JSON is written here.
AD_OUT_DIR = ROOT_MULTI / "AD_6mods_TopkADSAL"
# Side effect at cell execution: the directory (and any missing parents) is created.
AD_OUT_DIR.mkdir(parents=True, exist_ok=True)
print("AD 输出目录:", AD_OUT_DIR)

def _safe_read_json(p: Path):
    """Parse a UTF-8 JSON file, tolerating a missing path.

    Parameters
    ----------
    p : Path or None
        Location of the JSON file.

    Returns
    -------
    object or None
        The decoded JSON content, or ``None`` when ``p`` is ``None`` or the
        file does not exist. Malformed JSON still raises from ``json.load``.
    """
    if p is not None and p.exists():
        with open(p, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None

def _metrics(y_true, y_pred):
    """Regression scores for one set of predictions.

    Both inputs are converted to float arrays. Returns a dict with keys
    ``"r2"``, ``"mae"``, ``"rmse"`` (floats) and ``"n"`` (sample count).
    When ``y_true`` is empty the three scores are NaN and no scorer is called.
    """
    truth = np.asarray(y_true, float)
    guess = np.asarray(y_pred, float)
    count = len(truth)

    if count:
        r2 = float(r2_score(truth, guess))
        mae = float(mean_absolute_error(truth, guess))
        rmse = float(np.sqrt(mean_squared_error(truth, guess)))
    else:
        # Nothing to score (e.g. an empty in-AD / out-of-AD subset).
        r2 = mae = rmse = np.nan

    return {"r2": r2, "mae": mae, "rmse": rmse, "n": int(count)}

def compute_rho_ia(ref_X, ref_y, query_X, k=20, exclude_self=False):
    """
    Local density (rho) and local label inconsistency (IA) of each query row,
    computed from its nearest reference neighbours under cosine distance.

    With similarity ``w = max(1 - cosine_distance, 0)`` over the neighbours:

        rho  = sum(w)
        ybar = sum(w * y) / sum(w)
        IA   = sqrt( sum(w * (y - ybar)^2) / sum(w) )

    Parameters
    ----------
    ref_X : array-like, 2-D
        Reference feature matrix the neighbours are searched in.
    ref_y : array-like
        Labels of the reference rows (flattened to 1-D).
    query_X : array-like, 2-D
        Rows to score.
    k : int, default 20
        Number of neighbours used per query row.
    exclude_self : bool, default False
        Set to True when ``query_X`` is the reference set itself: ``k + 1``
        neighbours are retrieved and each row's own entry is removed.

    Returns
    -------
    rho : ndarray, shape (n_query,)
    ia : ndarray, shape (n_query,)
        NaN where the similarity weights of a row sum to zero (or are NaN).

    Notes
    -----
    When ``exclude_self`` is True and ``query_X`` equals ``ref_X``, the self
    match is removed by *index*, not by assuming it sits in column 0. With
    duplicated / tied rows the nearest-neighbour search does not guarantee that
    a point is returned first for itself, so dropping column 0 could discard a
    twin row and keep the row's own label. If a row is not among its own
    ``k + 1`` neighbours (more than ``k`` exact duplicates), the farthest
    neighbour is dropped instead. If ``query_X`` differs from ``ref_X`` the
    previous behaviour (drop the nearest neighbour) is kept.
    """
    ref_X   = np.asarray(ref_X, float)
    ref_y   = np.asarray(ref_y, float).reshape(-1)
    query_X = np.asarray(query_X, float)
    n_query = len(query_X)

    n_neighbors = k + 1 if exclude_self else k
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    nn.fit(ref_X)
    dists, idxs = nn.kneighbors(query_X, return_distance=True)

    if exclude_self:
        if query_X.shape == ref_X.shape and np.array_equal(query_X, ref_X):
            # Mark, per row, the column that holds the row's own index.
            is_self = idxs == np.arange(n_query)[:, None]
            # Self not returned (crowded out by ties): drop the farthest column.
            not_found = ~is_self.any(axis=1)
            is_self[not_found, -1] = True
            keep = ~is_self  # exactly n_neighbors - 1 True entries per row
            dists = dists[keep].reshape(n_query, n_neighbors - 1)
            idxs  = idxs[keep].reshape(n_query, n_neighbors - 1)
        else:
            # Query is not the reference set: keep the historical behaviour.
            dists = dists[:, 1:]
            idxs  = idxs[:, 1:]

    # Cosine distance lies in [0, 2]; negative similarities are clipped to 0.
    w = np.clip(1.0 - dists, 0.0, None)
    rho = w.sum(axis=1)

    ia = np.full(n_query, np.nan, dtype=float)
    ok = rho > 0  # False for zero or NaN weight sums -> IA stays NaN
    if ok.any():
        w_ok = w[ok]
        y_ok = ref_y[idxs[ok]]
        s_ok = rho[ok]
        ybar = (w_ok * y_ok).sum(axis=1) / s_ok
        var = (w_ok * (y_ok - ybar[:, None]) ** 2).sum(axis=1) / s_ok
        ia[ok] = np.sqrt(var)
    return rho, ia

def run_ad_5fold(
    X, y, groups,
    rf_params=None,
    k=20, coverage=0.90,
    name="mod",
    extra_fold_transform=None,  # e.g. Early fusion: per-fold PCA
    n_splits=5,
):
    """
    Grouped cross-validated applicability-domain (AD) evaluation.

    For every GroupKFold split:
      1. optionally replace the fold features via ``extra_fold_transform``;
      2. fit a RandomForestRegressor on the training part and predict the
         validation part;
      3. compute rho / IA on the training part (self excluded) and derive the
         thresholds from training information only:
         ``tau_rho`` = (1 - coverage)-quantile of rho,
         ``tau_IA``  = coverage-quantile of IA;
      4. compute rho / IA of the validation rows against the training part and
         flag a row as inside the AD when ``rho >= tau_rho`` and ``IA <= tau_IA``
         (a NaN rho or IA therefore counts as outside);
      5. score all / in-AD / out-of-AD validation rows with ``_metrics``.

    The two thresholds are set independently, so the fraction of rows that
    satisfies both can be lower than ``coverage``.

    Parameters
    ----------
    X : array-like
        Feature matrix; only used for splitting when ``extra_fold_transform``
        supplies the real features.
    y : array-like
        Targets (flattened to 1-D).
    groups : array-like
        Group label per row; rows of one group never straddle train/validation.
    rf_params : dict or None
        Overrides for the default random-forest parameters. Non-dict values
        are ignored.
    k : int
        Neighbour count passed to ``compute_rho_ia``.
    coverage : float
        Quantile level used for both thresholds.
    name : str
        Tag stored in the results and used in the output file names.
    extra_fold_transform : callable or None
        ``f(X_tr, X_va, tr_idx, va_idx) -> (X_tr_new, X_va_new)``.
    n_splits : int, default 5
        Number of GroupKFold folds (the historical value was fixed at 5).

    Returns
    -------
    df : pandas.DataFrame
        One row per validation sample (``idx`` is the position within ``X``).
    summary : dict
        Fold-averaged metrics, per-fold details and the RF parameters used.

    Side effects
    ------------
    Writes ``AD_{name}_per_sample.csv`` and ``AD_{name}_summary.json`` into
    ``AD_OUT_DIR`` and prints one progress line per fold.
    """
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    groups = np.asarray(groups)

    cv = GroupKFold(n_splits=n_splits)

    # Default RF parameters; anything in rf_params takes precedence.
    base_params = dict(
        n_estimators=600,
        max_features="sqrt",
        n_jobs=-1,
        random_state=2025,
    )
    if isinstance(rf_params, dict):
        base_params.update(rf_params)

    rows = []
    fold_summ = []

    for fold, (tr, va) in enumerate(cv.split(X, y, groups), 1):
        X_tr, y_tr = X[tr], y[tr]
        X_va, y_va = X[va], y[va]

        # Optional per-fold feature transform (e.g. PCA fitted on the training part).
        if callable(extra_fold_transform):
            X_tr, X_va = extra_fold_transform(X_tr, X_va, tr, va)

        rf = RandomForestRegressor(**base_params)
        rf.fit(X_tr, y_tr)
        pred_va = rf.predict(X_va)

        # Thresholds come from the training part only (no validation leakage).
        rho_tr, ia_tr = compute_rho_ia(X_tr, y_tr, X_tr, k=k, exclude_self=True)
        rho_tr_valid = rho_tr[np.isfinite(rho_tr)]
        ia_tr_valid  = ia_tr[np.isfinite(ia_tr)]

        tau_rho = float(np.quantile(rho_tr_valid, 1.0 - coverage)) if rho_tr_valid.size else 0.0
        tau_ia  = float(np.quantile(ia_tr_valid,  coverage))      if ia_tr_valid.size else np.inf

        # Validation rows are scored against the training part and classified.
        rho_va, ia_va = compute_rho_ia(X_tr, y_tr, X_va, k=k, exclude_self=False)
        ad_in = (rho_va >= tau_rho) & (ia_va <= tau_ia)

        df_fold = pd.DataFrame({
            "name": name,
            "fold": fold,
            "idx":  va.astype(int),
            "y_true": y_va,
            "y_pred": pred_va,
            "rho": rho_va,
            "IA":  ia_va,
            "tau_rho": tau_rho,
            "tau_IA":  tau_ia,
            "AD_in": ad_in.astype(bool),
        })
        rows.append(df_fold)

        m_all = _metrics(y_va, pred_va)
        m_in  = _metrics(y_va[ad_in], pred_va[ad_in])
        m_out = _metrics(y_va[~ad_in], pred_va[~ad_in])

        fold_summ.append({
            "fold": fold,
            "coverage_val": float(ad_in.mean()),
            "all": m_all,
            "in":  m_in,
            "out": m_out,
            "tau_rho": tau_rho,
            "tau_IA":  tau_ia,
        })

        print(f"[{name}] fold={fold} | cov={ad_in.mean():.3f} | R2(all/in/out)={m_all['r2']:.3f}/{m_in['r2']:.3f}/{m_out['r2']:.3f}")

    df = pd.concat(rows, ignore_index=True)

    def _mean_of(path):  # path like ("all", "r2")
        """Mean of one metric over the folds, skipping non-finite fold values."""
        vals = []
        for f in fold_summ:
            x = f[path[0]].get(path[1], np.nan)
            if np.isfinite(x):
                vals.append(float(x))
        return float(np.mean(vals)) if vals else np.nan

    summary = {
        "name": name,
        "k": int(k),
        "n_splits": int(n_splits),
        "coverage_train_target": float(coverage),
        "mean_coverage_val": float(np.mean([f["coverage_val"] for f in fold_summ])),
        "mean_r2_all":  _mean_of(("all", "r2")),
        "mean_r2_in":   _mean_of(("in", "r2")),
        "mean_r2_out":  _mean_of(("out", "r2")),
        "mean_mae_all": _mean_of(("all", "mae")),
        "mean_mae_in":  _mean_of(("in", "mae")),
        "mean_mae_out": _mean_of(("out", "mae")),
        "mean_rmse_all": _mean_of(("all", "rmse")),
        "mean_rmse_in":  _mean_of(("in", "rmse")),
        "mean_rmse_out": _mean_of(("out", "rmse")),
        "folds": fold_summ,
        "rf_params_used": base_params,
    }

    def _json_default(obj):
        """Convert NumPy scalars / arrays (e.g. tuned params from a search) for json."""
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    # Persist results. Without the default= hook a NumPy scalar inside
    # rf_params would abort here, after all folds have already been fitted.
    per_sample_path = AD_OUT_DIR / f"AD_{name}_per_sample.csv"
    summary_path = AD_OUT_DIR / f"AD_{name}_summary.json"
    df.to_csv(per_sample_path, index=False)
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False, default=_json_default)

    print(f"✅ saved: {per_sample_path}")
    print(f"✅ saved: {summary_path}")
    return df, summary

# Load the dataset once; df_all / groups_all_full are reused by the later cells.
df_all = pd.read_excel(DATA_PATH, engine="openpyxl").reset_index(drop=True)
# Non-numeric label entries become NaN.
df_all[LABEL_RAW] = pd.to_numeric(df_all[LABEL_RAW], errors="coerce")
mask_pos = df_all[LABEL_RAW] > 0
# Mask non-positive / missing labels to NaN *before* taking log10, so the
# result is the same as np.where(mask_pos, log10(raw), nan) but log10 is never
# evaluated on values <= 0 (which would emit RuntimeWarnings).
df_all[LABEL_LOG] = np.log10(df_all[LABEL_RAW].where(mask_pos))
# Group key per row (SMILES string), indexed by df_all row position.
groups_all_full = df_all[SMILES_COL].astype(str).values

print("df shape:", df_all.shape, "| valid y:", np.isfinite(df_all[LABEL_LOG].values).sum())


AD 输出目录: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL
df shape: (3620, 35) | valid y: 3620


In [2]:
# ===== Text AD =====
SMILES_OUT_DIR = ROOT_MULTI / "SMILES" / "smiles_outputs"
TEXT_EMB_PATH  = SMILES_OUT_DIR / "reg_smiles_cls_embeddings_all.npy"
METRICS_PATH_T = SMILES_OUT_DIR / "rf_text_pipeline_metrics.json"

X_text_all = np.load(TEXT_EMB_PATH)  # one embedding row per df_all row
y_full = df_all[LABEL_LOG].values

# The text embeddings are indexed by df_all row position, so the lengths must agree.
assert len(X_text_all) == len(y_full), (
    f"text embeddings ({len(X_text_all)}) and df_all ({len(y_full)}) are misaligned"
)

# Keep only rows with a finite log-label.
mask = np.isfinite(y_full)
X = X_text_all[mask]
y = y_full[mask]
groups = groups_all_full[mask]

# Tuned RF params may sit at the top level or under "rf"; accept either layout
# and only accept a dict (anything else would be ignored by run_ad_5fold anyway).
metrics_T = _safe_read_json(METRICS_PATH_T)
best_params_T = None
if isinstance(metrics_T, dict):
    rf_section = metrics_T.get("rf")
    if isinstance(metrics_T.get("best_params"), dict):
        best_params_T = metrics_T["best_params"]
    elif isinstance(rf_section, dict) and isinstance(rf_section.get("best_params"), dict):
        best_params_T = rf_section["best_params"]

print("Text X/y/groups:", X.shape, y.shape, groups.shape)
print("Text best_params:", best_params_T)
if best_params_T is None:
    # Make the fallback explicit instead of silently evaluating an untuned model.
    print("WARNING: no best_params found in", METRICS_PATH_T, "-> using default RF parameters")

df_text_ad, sum_text_ad = run_ad_5fold(
    X, y, groups,
    rf_params=best_params_T,
    k=20, coverage=0.90,
    name="TEXT",
)


Text X/y/groups: (3620, 768) (3620,) (3620,)
Text best_params: None
[TEXT] fold=1 | cov=0.836 | R2(all/in/out)=0.457/0.371/0.645
[TEXT] fold=2 | cov=0.827 | R2(all/in/out)=0.446/0.414/0.362
[TEXT] fold=3 | cov=0.818 | R2(all/in/out)=0.608/0.486/0.713
[TEXT] fold=4 | cov=0.827 | R2(all/in/out)=0.474/0.394/0.501
[TEXT] fold=5 | cov=0.807 | R2(all/in/out)=0.520/0.508/0.474
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_TEXT_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_TEXT_summary.json


In [3]:
# ===== Graph AD =====
GRAPH_OUT_DIR = ROOT_MULTI.joinpath("graph", "graph_outputs")
GRAPH_EMB_PATH = GRAPH_OUT_DIR.joinpath("reg_graph_embeddings.npy")
GRAPH_ROWID_PATH = GRAPH_OUT_DIR.joinpath("row_id_graph_for_emb.npy")
PARAMS_PATH_G = GRAPH_OUT_DIR.joinpath("rf_graph_best_params.json")

# Graph embeddings and, for each embedding row, the df_all row number it belongs to.
X_graph_raw = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_PATH).astype(int)

y_full = df_all[LABEL_LOG].values
mask_y = np.isfinite(y_full)

# Alignment: drop embedding rows whose df_all row has no finite label.
keep = mask_y[rowid_graph]
rowid_graph_kept = rowid_graph[keep]
X, y, groups = (
    X_graph_raw[keep],
    y_full[rowid_graph_kept],
    groups_all_full[rowid_graph_kept],
)

# The JSON file itself is used as the RF parameter dict.
best_params_G = _safe_read_json(PARAMS_PATH_G)

print("Graph X/y/groups:", X.shape, y.shape, groups.shape)
print("Graph best_params:", best_params_G)

df_graph_ad, sum_graph_ad = run_ad_5fold(
    X,
    y,
    groups,
    rf_params=best_params_G,
    k=20,
    coverage=0.90,
    name="GRAPH",
)


Graph X/y/groups: (3213, 256) (3213,) (3213,)
Graph best_params: {'max_depth': 30, 'max_features': 0.3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 975}
[GRAPH] fold=1 | cov=0.778 | R2(all/in/out)=0.401/0.438/0.262
[GRAPH] fold=2 | cov=0.779 | R2(all/in/out)=0.531/0.471/0.512
[GRAPH] fold=3 | cov=0.760 | R2(all/in/out)=0.507/0.560/0.233
[GRAPH] fold=4 | cov=0.769 | R2(all/in/out)=0.504/0.487/0.373
[GRAPH] fold=5 | cov=0.808 | R2(all/in/out)=0.435/0.470/0.257
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_GRAPH_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_GRAPH_summary.json


In [4]:
# ===== PhysChem AD =====
PHY_OUT_DIR = ROOT_MULTI.joinpath("phychem", "physchem_mlp_rf_v2")
PHY_EMB_PATH = PHY_OUT_DIR.joinpath("emb_physchem_mlp_all.npy")
PHY_ROWID_PATH = PHY_OUT_DIR.joinpath("row_id_clean.npy")
PHY_REPORT = PHY_OUT_DIR.joinpath("rf_physchem_embedding_meta_report.json")

# PhysChem embeddings and, for each embedding row, its df_all row number.
X_phys_raw = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)

y_full = df_all[LABEL_LOG].values
mask_y = np.isfinite(y_full)

# Keep only embedding rows whose df_all row carries a finite label.
keep = mask_y[rowid_phys]
rowid_phys_kept = rowid_phys[keep]
X, y, groups = (
    X_phys_raw[keep],
    y_full[rowid_phys_kept],
    groups_all_full[rowid_phys_kept],
)

# Tuned RF params are read from the report's "best_params" entry when present.
rep = _safe_read_json(PHY_REPORT)
if isinstance(rep, dict):
    best_params_P = rep.get("best_params", None)
else:
    best_params_P = None

print("PhysChem X/y/groups:", X.shape, y.shape, groups.shape)
print("PhysChem best_params:", best_params_P)

df_phys_ad, sum_phys_ad = run_ad_5fold(
    X,
    y,
    groups,
    rf_params=best_params_P,
    k=20,
    coverage=0.90,
    name="PHY",
)


PhysChem X/y/groups: (3406, 64) (3406,) (3406,)
PhysChem best_params: {'max_depth': 40, 'max_features': 0.5, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 925}
[PHY] fold=1 | cov=0.802 | R2(all/in/out)=0.450/0.326/0.462
[PHY] fold=2 | cov=0.821 | R2(all/in/out)=0.417/0.398/0.336
[PHY] fold=3 | cov=0.799 | R2(all/in/out)=0.394/0.253/0.482
[PHY] fold=4 | cov=0.838 | R2(all/in/out)=0.380/0.443/-0.032
[PHY] fold=5 | cov=0.787 | R2(all/in/out)=0.553/0.481/0.528
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_PHY_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_PHY_summary.json


In [5]:
# ===== Early-fusion AD (features built the same way as EARLY_TGP: PCA-256 inside every fold) =====
EARLY_OUT_DIR = ROOT_MULTI.joinpath("early_fusion_TGP")
EARLY_METRICS = EARLY_OUT_DIR.joinpath("metrics_TGP_early_rf.json")

# 1) Load the three raw embeddings and align them on the intersection of the
#    three modalities restricted to finite labels (mirrors EARLY_TGP cell 3).
SMILES_OUT_DIR = ROOT_MULTI.joinpath("SMILES", "smiles_outputs")
TEXT_EMB_PATH = SMILES_OUT_DIR.joinpath("reg_smiles_cls_embeddings_all.npy")
GRAPH_OUT_DIR = ROOT_MULTI.joinpath("graph", "graph_outputs")
GRAPH_EMB_PATH = GRAPH_OUT_DIR.joinpath("reg_graph_embeddings.npy")
GRAPH_ROWID_EMB = GRAPH_OUT_DIR.joinpath("row_id_graph_for_emb.npy")
PHY_OUT_DIR = ROOT_MULTI.joinpath("phychem", "physchem_mlp_rf_v2")
PHY_EMB_PATH = PHY_OUT_DIR.joinpath("emb_physchem_mlp_all.npy")
PHY_ROWID_PATH = PHY_OUT_DIR.joinpath("row_id_clean.npy")

# Text embeddings are indexed directly by df_all row number.
X_text_all = np.load(TEXT_EMB_PATH)
rowid_text = np.arange(len(df_all), dtype=int)

X_graph_raw = np.load(GRAPH_EMB_PATH)
rowid_graph = np.load(GRAPH_ROWID_EMB).astype(int)

X_phys_raw = np.load(PHY_EMB_PATH)
rowid_phys = np.load(PHY_ROWID_PATH).astype(int)

# df_all row numbers available in every modality, in ascending order.
ids_text, ids_graph, ids_phys = (
    set(ids.tolist()) for ids in (rowid_text, rowid_graph, rowid_phys)
)
ids_inter = sorted(ids_text & ids_graph & ids_phys)

# df_all row number -> position inside the graph / physchem embedding arrays.
idx_map_graph = dict(zip(rowid_graph, range(len(rowid_graph))))
idx_map_phys = dict(zip(rowid_phys, range(len(rowid_phys))))

y_full = df_all[LABEL_LOG].values

# Rows of the intersection that also have a finite label, then one list per block.
rid_list = [rid for rid in ids_inter if np.isfinite(y_full[rid])]
X_text_list = [X_text_all[rid] for rid in rid_list]
X_graph_list = [X_graph_raw[idx_map_graph[rid]] for rid in rid_list]
X_phys_list = [X_phys_raw[idx_map_phys[rid]] for rid in rid_list]
y_list = [y_full[rid] for rid in rid_list]
groups_list = [groups_all_full[rid] for rid in rid_list]

X_text = np.asarray(X_text_list, float)
X_graph = np.asarray(X_graph_list, float)
X_phys = np.asarray(X_phys_list, float)
y = np.asarray(y_list, float)
groups = np.asarray(groups_list)

# 2) META features (mirrors EARLY_TGP cell 4); only columns present in df_all are used.
NUM_META_COLS = [col for col in ("Duration_Value(hour)",) if col in df_all.columns]
CAT_META_COLS = [col for col in ("Effect", "Endpoint") if col in df_all.columns]

df_meta = df_all.loc[np.asarray(rid_list, int)].copy().reset_index(drop=True)

# Numeric meta: standardised on all aligned rows (fitted once, outside the CV folds).
if not NUM_META_COLS:
    X_num = np.zeros((len(df_meta), 0), dtype=np.float32)
else:
    from sklearn.preprocessing import StandardScaler
    scaler_meta = StandardScaler()
    X_num = scaler_meta.fit_transform(df_meta[NUM_META_COLS].values)

# Categorical meta: one-hot encoded on all aligned rows (fitted once, outside the CV folds).
if not CAT_META_COLS:
    X_cat = np.zeros((len(df_meta), 0), dtype=np.float32)
else:
    from sklearn.preprocessing import OneHotEncoder
    ohe_meta = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat = ohe_meta.fit_transform(df_meta[CAT_META_COLS].astype(str))

X_meta = np.concatenate((X_num, X_cat), axis=1).astype(float)

print("Early aligned:", X_text.shape, X_graph.shape, X_phys.shape, X_meta.shape, y.shape)

# 3) Tuned RF hyper-parameters of the early-fusion model, if the metrics file has them.
m = _safe_read_json(EARLY_METRICS)
if isinstance(m, dict):
    best_params_early = m.get("best_params", None)
else:
    best_params_early = None
print("Early best_params:", best_params_early)

# 4) Per-fold PCA-256 on the text block, followed by feature concatenation
def early_fold_transform(_Xtr, _Xva, tr_idx, va_idx):
    """Build the early-fusion feature matrices for one CV fold.

    ``_Xtr`` / ``_Xva`` are placeholders and are ignored. The real features are
    taken from the notebook-level arrays ``X_text``, ``X_graph``, ``X_phys`` and
    ``X_meta``, indexed with ``tr_idx`` / ``va_idx``.

    A 256-component PCA is fitted on the training rows of ``X_text`` only and
    applied to the validation rows; the reduced text block is then concatenated
    column-wise with the graph, physchem and meta blocks.

    NOTE: the four arrays are looked up when the function is *called*, so it
    must run while they still hold the early-fusion arrays (another cell that
    reassigns e.g. ``X_meta`` would change what this function uses).

    Returns ``(Xtr, Xva)``.
    """
    text_pca = PCA(n_components=256, random_state=42)
    text_tr_256 = text_pca.fit_transform(X_text[tr_idx])
    text_va_256 = text_pca.transform(X_text[va_idx])

    def _stack(text_block, rows):
        # [ text(256) | graph | physchem | meta ] for the given row positions
        return np.concatenate(
            [text_block, X_graph[rows], X_phys[rows], X_meta[rows]], axis=1
        )

    return _stack(text_tr_256, tr_idx), _stack(text_va_256, va_idx)

# run_ad_5fold needs an X to split on; the per-fold transform supplies the real
# features, so a placeholder with one row per sample is sufficient.
X_dummy = np.zeros(shape=(len(y), 1), dtype=float)

df_early_ad, sum_early_ad = run_ad_5fold(
    X_dummy,
    y,
    groups,
    rf_params=best_params_early,
    k=20,
    coverage=0.90,
    name="EARLY_TGP",
    extra_fold_transform=early_fold_transform,
)


Early aligned: (3103, 768) (3103, 256) (3103, 64) (3103, 4) (3103,)
Early best_params: {'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 924}
[EARLY_TGP] fold=1 | cov=0.804 | R2(all/in/out)=0.619/0.681/0.363
[EARLY_TGP] fold=2 | cov=0.824 | R2(all/in/out)=0.638/0.651/0.462
[EARLY_TGP] fold=3 | cov=0.815 | R2(all/in/out)=0.631/0.633/0.521
[EARLY_TGP] fold=4 | cov=0.792 | R2(all/in/out)=0.647/0.686/0.470
[EARLY_TGP] fold=5 | cov=0.744 | R2(all/in/out)=0.630/0.691/0.408
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_EARLY_TGP_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_EARLY_TGP_summary.json


In [6]:
# ===== Mid-fusion AD (best_params_mid is filled in by hand) =====
MID_OUT_DIR = ROOT_MULTI.joinpath("mid_fusion_TGP")

FUSED_PATH = MID_OUT_DIR.joinpath("fused_emb_TGP_all.npy")
META_PATH = MID_OUT_DIR.joinpath("X_meta_TGP_all.npy")
Y_PATH = MID_OUT_DIR.joinpath("y_all_TGP.npy")
G_PATH = MID_OUT_DIR.joinpath("groups_TGP.npy")

# Saved mid-fusion arrays: fused embedding, meta features, targets, group labels.
X_fused = np.load(FUSED_PATH)
X_meta = np.load(META_PATH)
y = np.load(Y_PATH).reshape(-1)
groups = np.load(G_PATH).astype(str)

# Model input = fused embedding followed by the meta columns.
X = np.concatenate((X_fused, X_meta), axis=1)

# Manually entered RF hyper-parameters (replace with the best_params from your
# own random search). max_features accepts "sqrt" / "log2" / a float in (0, 1] / None.
best_params_mid = dict(
    n_estimators=291,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=0.8,
    bootstrap=True,
)

print("Mid X/y/groups:", X.shape, y.shape, groups.shape)
print("Mid best_params_mid:", best_params_mid or "(None -> 用默认RF参数)")

df_mid_ad, sum_mid_ad = run_ad_5fold(
    X,
    y,
    groups,
    rf_params=best_params_mid or None,  # an empty / missing dict falls back to the defaults
    k=20,
    coverage=0.90,
    name="MID_TGP",
)


Mid X/y/groups: (3103, 260) (3103,) (3103,)
Mid best_params_mid: {'n_estimators': 291, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.8, 'bootstrap': True}
[MID_TGP] fold=1 | cov=0.845 | R2(all/in/out)=0.786/0.793/0.546
[MID_TGP] fold=2 | cov=0.870 | R2(all/in/out)=0.760/0.750/0.533
[MID_TGP] fold=3 | cov=0.776 | R2(all/in/out)=0.796/0.766/0.684
[MID_TGP] fold=4 | cov=0.769 | R2(all/in/out)=0.806/0.800/0.701
[MID_TGP] fold=5 | cov=0.747 | R2(all/in/out)=0.800/0.745/0.674
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_MID_TGP_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_MID_TGP_summary.json


In [7]:
# ===== Late-fusion AD =====
LATE_OUT_DIR = ROOT_MULTI.joinpath("late_fusion_TGP")

Xtr_path = LATE_OUT_DIR.joinpath("late_TGP_emb_train.npy")
Xte_path = LATE_OUT_DIR.joinpath("late_TGP_emb_test.npy")
ytr_path = LATE_OUT_DIR.joinpath("y_train_TGP_late.npy")
yte_path = LATE_OUT_DIR.joinpath("y_test_TGP_late.npy")
idxtr_path = LATE_OUT_DIR.joinpath("late_TGP_train_idx.npy")  # df_all row numbers
idxte_path = LATE_OUT_DIR.joinpath("late_TGP_test_idx.npy")
metrics_path = LATE_OUT_DIR.joinpath("metrics_TGP_late_meta_rf.json")

# Saved train / test parts of the late-fusion features, targets and row ids.
X_tr, X_te = np.load(Xtr_path), np.load(Xte_path)
y_tr = np.load(ytr_path).reshape(-1)
y_te = np.load(yte_path).reshape(-1)
idx_tr = np.load(idxtr_path).astype(int)
idx_te = np.load(idxte_path).astype(int)

# Stack train + test into one set (train rows first); the AD run re-splits it by group.
X = np.concatenate((X_tr, X_te), axis=0)
y = np.concatenate((y_tr, y_te), axis=0)
rowid = np.concatenate((idx_tr, idx_te), axis=0)

# Group labels are looked up in df_all by row id.
groups = df_all.loc[rowid, SMILES_COL].astype(str).values

m = _safe_read_json(metrics_path)
if isinstance(m, dict):
    best_params_meta = m.get("best_params_meta", None)
else:
    best_params_meta = None

print("Late X/y/groups:", X.shape, y.shape, groups.shape)
print("Late best_params_meta:", best_params_meta)

df_late_ad, sum_late_ad = run_ad_5fold(
    X,
    y,
    groups,
    rf_params=best_params_meta,
    k=20,
    coverage=0.90,
    name="LATE_TGP",
)


Late X/y/groups: (1637, 3) (1637,) (1637,)
Late best_params_meta: {'max_depth': 5, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 976}
[LATE_TGP] fold=1 | cov=0.845 | R2(all/in/out)=0.491/0.417/0.495
[LATE_TGP] fold=2 | cov=0.793 | R2(all/in/out)=0.517/0.462/0.537
[LATE_TGP] fold=3 | cov=0.829 | R2(all/in/out)=0.640/0.541/0.735
[LATE_TGP] fold=4 | cov=0.832 | R2(all/in/out)=0.563/0.521/0.551
[LATE_TGP] fold=5 | cov=0.862 | R2(all/in/out)=0.582/0.504/0.675
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_LATE_TGP_per_sample.csv
✅ saved: /root/Invertebrates_EC50_multi_fusion/AD_6mods_TopkADSAL/AD_LATE_TGP_summary.json
