In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle, json, random

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========= 路径 & 列名，根据你实际情况改 =========
DATA_PATH = Path("/root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx")  # 原始 + 理化性质 Excel
OUT_DIR   = Path("/root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

SMILES_COL     = "SMILES_Canonical_RDKit"  # SMILES 列
ROWID_COL      = "row_id"                  # 若没有，会自动创建
RAW_TARGET_COL = "mgperL"                  # Excel 里原始浓度列（mg/L）

# 理化性质特征前缀
PHYS_PREFIXES = ["DESC_", "KIER_", "ESTATE_"]

# meta 特征（按你自己的列名改）
NUM_META_COLS_CANDIDATE = ["Duration_Value(hour)"]   # 数值型 meta
CAT_META_COLS_CANDIDATE = ["Effect", "Endpoint"]     # 分类型 meta

# MLP（提 embedding）划分参数
MLP_TRAIN_SIZE = 0.8
MLP_SPLIT_SEED = 2025

# RF 划分参数（embedding + meta → RF）
RF_TRAIN_SIZE  = 0.8
RF_SPLIT_SEED  = 2024

# RF 超参随机搜索空间
RF_PARAM_DISTS = {
    "n_estimators":      randint(200, 1001),
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf":  randint(1, 5),
    "max_features":      ["sqrt", "log2", 0.5, 0.8],
}
RF_N_ITER = 30

# MLP 超参
MLP_HIDDEN_DIM   = 128
MLP_EMB_DIM      = 64
MLP_DROPOUT      = 0.2
MLP_EPOCHS       = 80
MLP_BATCH_SIZE   = 128
MLP_LR           = 3e-3
MLP_WEIGHT_DECAY = 1e-5

# 随机种子
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用设备:", device)
print("数据路径:", DATA_PATH)
print("输出目录:", OUT_DIR)


使用设备: cuda
数据路径: /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
输出目录: /root/Invertebrates_EC50_multi_fusion/phychem/physchem_mlp_rf_v2


In [2]:
# 1) 读 Excel
df = pd.read_excel(DATA_PATH)
print("原始数据形状:", df.shape)
print("前 20 列:", df.columns.tolist()[:20])

# 2) 确保 row_id 存在
if ROWID_COL not in df.columns:
    df[ROWID_COL] = np.arange(len(df), dtype=int)
    print(f"未发现 {ROWID_COL}，已按顺序生成。")
else:
    df[ROWID_COL] = df[ROWID_COL].astype(int)

# 3) 检查关键列
assert SMILES_COL in df.columns, f"缺少 SMILES 列: {SMILES_COL}"
assert RAW_TARGET_COL in df.columns, f"缺少原始目标列: {RAW_TARGET_COL}"

# 4) 构造 log10(mg/L)
df[RAW_TARGET_COL] = df[RAW_TARGET_COL].astype(float)
mask_positive = df[RAW_TARGET_COL] > 0
if (~mask_positive).sum() > 0:
    print(f"⚠ 有 {(~mask_positive).sum()} 条 mg/L <= 0，无法取 log10，将被丢弃。")

df = df[mask_positive].reset_index(drop=True)
df["mgperL_log"] = np.log10(df[RAW_TARGET_COL])
print("构造 mgperL_log 完成。")

# 5) 找理化特征列
phys_cols = [c for c in df.columns
             if any(c.startswith(pref) for pref in PHYS_PREFIXES)]
print(f"找到 {len(phys_cols)} 个理化特征:", phys_cols[:10], "...")

# 6) 确定 meta 列
num_meta_cols = [c for c in NUM_META_COLS_CANDIDATE if c in df.columns]
cat_meta_cols = [c for c in CAT_META_COLS_CANDIDATE if c in df.columns]

print("✅ 数值 meta 列:", num_meta_cols)
print("✅ 分类 meta 列:", cat_meta_cols)

# 7) 丢掉所有有缺失的行（理化 + meta + SMILES + 目标）
required_cols = phys_cols + num_meta_cols + cat_meta_cols + [SMILES_COL, RAW_TARGET_COL, "mgperL_log"]
df_clean = df.dropna(subset=required_cols).reset_index(drop=True)
print("丢掉缺失后形状:", df_clean.shape)

# 8) 准备基础数组
row_id_clean = df_clean[ROWID_COL].values
groups_clean = df_clean[SMILES_COL].astype(str).values
y_raw_clean  = df_clean[RAW_TARGET_COL].astype(float).values
y_log_clean  = df_clean["mgperL_log"].astype(float).values

X_phys = df_clean[phys_cols].astype(float).values
print("X_phys 形状:", X_phys.shape, "y_log_clean 形状:", y_log_clean.shape)


原始数据形状: (3620, 34)
前 20 列: ['SMILES_Canonical_RDKit', 'Duration_Value(hour)', 'Effect', 'Endpoint', 'mgperL', 'Species Group', 'ChemicalName', 'CAS', 'CanonicalSMILES', 'database', 'DESC_MolWt', 'DESC_ExactMolWt', 'DESC_HeavyAtomCount', 'DESC_RingCount', 'DESC_NumAromaticRings', 'DESC_FractionCSP3', 'DESC_MolLogP', 'DESC_TPSA', 'DESC_ASA_Labute', 'DESC_HBA']
未发现 row_id，已按顺序生成。
构造 mgperL_log 完成。
找到 24 个理化特征: ['DESC_MolWt', 'DESC_ExactMolWt', 'DESC_HeavyAtomCount', 'DESC_RingCount', 'DESC_NumAromaticRings', 'DESC_FractionCSP3', 'DESC_MolLogP', 'DESC_TPSA', 'DESC_ASA_Labute', 'DESC_HBA'] ...
✅ 数值 meta 列: ['Duration_Value(hour)']
✅ 分类 meta 列: ['Effect', 'Endpoint']
丢掉缺失后形状: (3408, 36)
X_phys 形状: (3408, 24) y_log_clean 形状: (3408,)


In [3]:
# ====== 新增：进一步丢弃包含 inf / -inf / 超大值的行 ======
phys_mat = df_clean[phys_cols].astype(float).to_numpy()

# 行级别：所有理化特征都必须是有限数值（不是 NaN，不是 inf）
mask_finite = np.isfinite(phys_mat).all(axis=1)

n_bad = (~mask_finite).sum()
print(f"包含 inf 或超大值的行数: {n_bad}")

if n_bad > 0:
    df_clean = df_clean[mask_finite].reset_index(drop=True)
    print("进一步清洗后形状:", df_clean.shape)

    # 重新构造基础数组
    row_id_clean = df_clean[ROWID_COL].values
    groups_clean = df_clean[SMILES_COL].astype(str).values
    y_raw_clean  = df_clean[RAW_TARGET_COL].astype(float).values
    y_log_clean  = df_clean["mgperL_log"].astype(float).values
    X_phys       = df_clean[phys_cols].astype(float).values

print("最终 X_phys 形状:", X_phys.shape, "y_log_clean 形状:", y_log_clean.shape)


包含 inf 或超大值的行数: 2
进一步清洗后形状: (3406, 36)
最终 X_phys 形状: (3406, 24) y_log_clean 形状: (3406,)


In [4]:
class PhysChemDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float().view(-1, 1)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class PhysChemMLP(nn.Module):
    """X_phys → hidden → emb → y_pred"""
    def __init__(self, in_dim, hidden_dim=128, emb_dim=64, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc_emb = nn.Linear(hidden_dim, emb_dim)
        self.fc_out = nn.Linear(emb_dim, 1)

    def forward(self, x, return_emb=False):
        h = self.act(self.fc1(x))
        h = self.dropout(h)
        emb = self.act(self.fc_emb(h))
        out = self.fc_out(emb)
        if return_emb:
            return out, emb
        return out


In [5]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler

gss_mlp = GroupShuffleSplit(
    n_splits=1,
    train_size=MLP_TRAIN_SIZE,
    random_state=MLP_SPLIT_SEED,
)

train_idx_mlp, val_idx_mlp = next(gss_mlp.split(X_phys, y_log_clean, groups_clean))
print(f"MLP 划分: train={len(train_idx_mlp)}, val={len(val_idx_mlp)}")

X_phys_train = X_phys[train_idx_mlp]
X_phys_val   = X_phys[val_idx_mlp]
y_train_mlp  = y_log_clean[train_idx_mlp]
y_val_mlp    = y_log_clean[val_idx_mlp]

# 标准化（只用 MLP 的 train 拟合）
scaler_mlp = StandardScaler()
X_train_mlp_scaled = scaler_mlp.fit_transform(X_phys_train)
X_val_mlp_scaled   = scaler_mlp.transform(X_phys_val)
X_all_mlp_scaled   = scaler_mlp.transform(X_phys)

print("MLP: X_train / X_val / X_all:", X_train_mlp_scaled.shape, X_val_mlp_scaled.shape, X_all_mlp_scaled.shape)

with open(OUT_DIR / "scaler_physchem_mlp.pkl", "wb") as f:
    pickle.dump(scaler_mlp, f)
print("✅ 已保存 scaler_physchem_mlp.pkl")


MLP 划分: train=2716, val=690
MLP: X_train / X_val / X_all: (2716, 24) (690, 24) (3406, 24)
✅ 已保存 scaler_physchem_mlp.pkl


In [6]:
LOSS_FN = nn.L1Loss()

train_ds = PhysChemDataset(X_train_mlp_scaled, y_train_mlp)
val_ds   = PhysChemDataset(X_val_mlp_scaled,   y_val_mlp)
all_ds   = PhysChemDataset(X_all_mlp_scaled,   y_log_clean)

train_loader = DataLoader(train_ds, batch_size=MLP_BATCH_SIZE, shuffle=True,  drop_last=False)
val_loader   = DataLoader(val_ds,   batch_size=MLP_BATCH_SIZE, shuffle=False, drop_last=False)
all_loader   = DataLoader(all_ds,   batch_size=MLP_BATCH_SIZE, shuffle=False, drop_last=False)

model_mlp = PhysChemMLP(
    in_dim=X_train_mlp_scaled.shape[1],
    hidden_dim=MLP_HIDDEN_DIM,
    emb_dim=MLP_EMB_DIM,
    dropout=MLP_DROPOUT,
).to(device)

optimizer = torch.optim.AdamW(
    model_mlp.parameters(),
    lr=MLP_LR,
    weight_decay=MLP_WEIGHT_DECAY,
)

best_val_loss = np.inf
best_state = None

print("\n===== 训练理化端 MLP（只为提 embedding）=====")
for epoch in range(1, MLP_EPOCHS + 1):
    model_mlp.train()
    train_losses = []
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model_mlp(xb)
        loss = LOSS_FN(pred, yb)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    model_mlp.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model_mlp(xb)
            loss = LOSS_FN(pred, yb)
            val_losses.append(loss.item())

    mean_tr = float(np.mean(train_losses))
    mean_va = float(np.mean(val_losses))
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | train_loss={mean_tr:.4f} | val_loss={mean_va:.4f}")

    if mean_va < best_val_loss:
        best_val_loss = mean_va
        best_state = {k: v.cpu().clone() for k, v in model_mlp.state_dict().items()}

print("MLP 最优 val_loss:", best_val_loss)

if best_state is not None:
    model_mlp.load_state_dict(best_state)
model_mlp.to(device)
model_mlp.eval()

emb_list = []
with torch.no_grad():
    for xb, yb in all_loader:
        xb = xb.to(device)
        _, emb = model_mlp(xb, return_emb=True)
        emb_list.append(emb.cpu().numpy())

emb_all = np.concatenate(emb_list, axis=0)  # (N_clean, emb_dim)
print("emb_all 形状:", emb_all.shape)

# 保存 embedding + row_id
np.save(OUT_DIR / "emb_physchem_mlp_all.npy", emb_all)
np.save(OUT_DIR / "row_id_clean.npy", row_id_clean)

emb_cols = [f"emb_phys_{i}" for i in range(MLP_EMB_DIM)]
df_emb = pd.DataFrame(emb_all, columns=emb_cols)
df_emb[ROWID_COL] = row_id_clean
df_emb.to_parquet(OUT_DIR / "emb_physchem_mlp_all_with_rowid.parquet", index=False, engine="fastparquet",)

print("✅ 已保存 MLP embedding 及 row_id 对齐信息")



===== 训练理化端 MLP（只为提 embedding）=====
Epoch 001 | train_loss=0.9479 | val_loss=0.9013
Epoch 010 | train_loss=0.7597 | val_loss=0.8005
Epoch 020 | train_loss=0.6998 | val_loss=0.7856
Epoch 030 | train_loss=0.6496 | val_loss=0.7632
Epoch 040 | train_loss=0.6273 | val_loss=0.7660
Epoch 050 | train_loss=0.6115 | val_loss=0.7587
Epoch 060 | train_loss=0.6119 | val_loss=0.7610
Epoch 070 | train_loss=0.5941 | val_loss=0.7519
Epoch 080 | train_loss=0.5740 | val_loss=0.7533
MLP 最优 val_loss: 0.7463072737058004
emb_all 形状: (3406, 64)
✅ 已保存 MLP embedding 及 row_id 对齐信息


In [7]:
# 数值 meta
if num_meta_cols:
    X_meta_num = df_clean[num_meta_cols].astype(float).values
else:
    X_meta_num = np.zeros((len(df_clean), 0), dtype=float)

# 分类 meta → OneHot
if cat_meta_cols:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    X_meta_cat = ohe.fit_transform(df_clean[cat_meta_cols].astype(str))
    with open(OUT_DIR / "ohe_physchem_meta.pkl", "wb") as f:
        pickle.dump(ohe, f)
else:
    X_meta_cat = np.zeros((len(df_clean), 0), dtype=float)

X_meta_all = np.concatenate([X_meta_num, X_meta_cat], axis=1)
print("X_meta_all 形状:", X_meta_all.shape)

# RF 输入：embedding + meta
X_rf_all = np.concatenate([emb_all, X_meta_all], axis=1)
y_rf_log  = y_log_clean.copy()   # RF 在 log10(mg/L) 空间拟合
y_rf_raw  = y_raw_clean.copy()
groups_rf = groups_clean.copy()

print("X_rf_all 形状:", X_rf_all.shape, "| y_rf_log 形状:", y_rf_log.shape)


X_meta_all 形状: (3406, 4)
X_rf_all 形状: (3406, 68) | y_rf_log 形状: (3406,)


In [8]:
gss_rf = GroupShuffleSplit(
    n_splits=1,
    train_size=RF_TRAIN_SIZE,
    random_state=RF_SPLIT_SEED,
)

train_idx_rf, val_idx_rf = next(gss_rf.split(X_rf_all, y_rf_log, groups_rf))
print(f"RF 划分: train={len(train_idx_rf)}, val={len(val_idx_rf)}")

train_row_id_rf = row_id_clean[train_idx_rf]
val_row_id_rf   = row_id_clean[val_idx_rf]

np.save(OUT_DIR / "rf_train_idx.npy", train_idx_rf)
np.save(OUT_DIR / "rf_val_idx.npy",   val_idx_rf)
np.save(OUT_DIR / "rf_train_row_id.npy", train_row_id_rf)
np.save(OUT_DIR / "rf_val_row_id.npy",   val_row_id_rf)

print("✅ 已保存 RF train/val 索引及对应 row_id（这是你后面融合时对齐用的关键）")


RF 划分: train=2690, val=716
✅ 已保存 RF train/val 索引及对应 row_id（这是你后面融合时对齐用的关键）


In [9]:
X_train_rf = X_rf_all[train_idx_rf]
y_train_rf = y_rf_log[train_idx_rf]
groups_train_rf = groups_rf[train_idx_rf]

X_val_rf = X_rf_all[val_idx_rf]
y_val_rf = y_rf_log[val_idx_rf]

print("RF train 形状:", X_train_rf.shape, "| val 形状:", X_val_rf.shape)

gkf_rf = GroupKFold(n_splits=10)
cv_splitter_rf = list(gkf_rf.split(X_train_rf, y_train_rf, groups_train_rf))

rf_base = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
)

print("\n===== RF：在 train 内做十折 RandomizedSearchCV（log10 空间）=====")
search_rf = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=RF_PARAM_DISTS,
    n_iter=RF_N_ITER,
    scoring="r2",
    cv=cv_splitter_rf,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    refit=True,  # 自动在 full train 上用最优超参 refit
)

search_rf.fit(X_train_rf, y_train_rf)

best_rf        = search_rf.best_estimator_
best_params    = search_rf.best_params_
best_cv_score  = search_rf.best_score_

print("\nRF 最优参数:", best_params)
print("RF train 十折 CV 最佳 R2:", best_cv_score)


RF train 形状: (2690, 68) | val 形状: (716, 68)

===== RF：在 train 内做十折 RandomizedSearchCV（log10 空间）=====
Fitting 10 folds for each of 30 candidates, totalling 300 fits

RF 最优参数: {'max_depth': 40, 'max_features': 0.5, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 925}
RF train 十折 CV 最佳 R2: 0.46369468628107613


In [10]:
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

def compute_metrics(y_true, y_pred):
    r2   = r2_score(y_true, y_pred)
    mae  = mean_absolute_error(y_true, y_pred)
    mse  = mean_squared_error(y_true, y_pred)   # 老版本没有 squared 参数
    rmse = np.sqrt(mse)
    R    = pearsonr(y_true, y_pred)[0]
    return {"R2": r2, "MAE": mae, "RMSE": rmse, "R": R}

# log10 空间的预测
y_train_pred_log = best_rf.predict(X_train_rf)
y_val_pred_log   = best_rf.predict(X_val_rf)

# 对应的原始 mg/L 空间预测（10^log）
y_train_true_raw = y_rf_raw[train_idx_rf]
y_val_true_raw   = y_rf_raw[val_idx_rf]

y_train_pred_raw = np.power(10.0, y_train_pred_log)
y_val_pred_raw   = np.power(10.0, y_val_pred_log)

metrics_train_log = compute_metrics(y_train_rf, y_train_pred_log)
metrics_val_log   = compute_metrics(y_val_rf,   y_val_pred_log)

metrics_train_raw = compute_metrics(y_train_true_raw, y_train_pred_raw)
metrics_val_raw   = compute_metrics(y_val_true_raw,   y_val_pred_raw)

print("\n===== RF 指标（log10 空间）=====")
print("Train:", metrics_train_log)
print("Val  :", metrics_val_log)

print("\n===== RF 指标（原始 mg/L 空间）=====")
print("Train:", metrics_train_raw)
print("Val  :", metrics_val_raw)



===== RF 指标（log10 空间）=====
Train: {'R2': 0.8388686189688648, 'MAE': 0.34010284369922916, 'RMSE': np.float64(0.48095303429286074), 'R': np.float64(0.9259615310884322)}
Val  : {'R2': 0.5035816070048983, 'MAE': 0.6238301340682681, 'RMSE': np.float64(0.8299235808857305), 'R': np.float64(0.7161049219772953)}

===== RF 指标（原始 mg/L 空间）=====
Train: {'R2': 0.4772390995645611, 'MAE': 24.44926516624776, 'RMSE': np.float64(60.37092696499071), 'R': np.float64(0.7995182228408871)}
Val  : {'R2': 0.06519466478336722, 'MAE': 39.15034364449804, 'RMSE': np.float64(85.01951089073852), 'R': np.float64(0.47575130501813967)}


In [11]:
# 1) 保存预测数组（log 空间）
np.save(OUT_DIR / "rf_y_train_pred_log.npy", y_train_pred_log)
np.save(OUT_DIR / "rf_y_val_pred_log.npy",   y_val_pred_log)

# 2) 保存预测数组（原始 mg/L 空间）
np.save(OUT_DIR / "rf_y_train_pred_raw.npy", y_train_pred_raw)
np.save(OUT_DIR / "rf_y_val_pred_raw.npy",   y_val_pred_raw)

# 3) 带 row_id 的汇总表（每一行都有：split, row_id, y_true_log/raw, y_pred_log/raw）
df_rf_all = pd.DataFrame({
    ROWID_COL: row_id_clean,
    "split_rf": "train",                 # 先全填成 train，下面再改 val
    "y_true_log": y_log_clean,
    "y_true_raw": y_raw_clean,
    "y_pred_log": np.nan,
    "y_pred_raw": np.nan,
})

# 给 train/val 填入对应预测
df_rf_all.loc[train_idx_rf, "y_pred_log"] = y_train_pred_log
df_rf_all.loc[val_idx_rf,   "y_pred_log"] = y_val_pred_log
df_rf_all.loc[val_idx_rf,   "split_rf"]   = "val"

df_rf_all.loc[train_idx_rf, "y_pred_raw"] = y_train_pred_raw
df_rf_all.loc[val_idx_rf,   "y_pred_raw"] = y_val_pred_raw

# 保存 parquet
df_rf_all.to_parquet(OUT_DIR / "rf_results_with_rowid.parquet", index=False, engine="fastparquet",)
print("✅ 已保存 RF 全部结果 + 索引 到 rf_results_with_rowid.parquet")

# 4) 保存 RF 模型（可选）
import joblib
joblib.dump(best_rf, OUT_DIR / "rf_physchem_embedding_meta_best.joblib")
print("✅ 已保存 RF 模型 rf_physchem_embedding_meta_best.joblib")

# 5) 保存 RF 指标 & 最优超参（log/raw 两种空间）
rf_report = {
    "best_params": {k: (int(v) if isinstance(v, (np.integer,)) else v)
                    for k, v in best_params.items()},
    "best_cv_r2_log": float(best_cv_score),
    "metrics_train_log": {k: float(v) for k, v in metrics_train_log.items()},
    "metrics_val_log":   {k: float(v) for k, v in metrics_val_log.items()},
    "metrics_train_raw": {k: float(v) for k, v in metrics_train_raw.items()},
    "metrics_val_raw":   {k: float(v) for k, v in metrics_val_raw.items()},
    "n_train_rf": int(len(train_idx_rf)),
    "n_val_rf":   int(len(val_idx_rf)),
}

with open(OUT_DIR / "rf_physchem_embedding_meta_report.json", "w", encoding="utf-8") as f:
    json.dump(rf_report, f, indent=2, ensure_ascii=False)

print("✅ RF 报告已保存到 rf_physchem_embedding_meta_report.json")
print("✅ RF train/val 索引 & row_id 已保存为 numpy（rf_train_idx.npy 等）")
print("✅ MLP embedding 全量 & 对齐 row_id 已保存为 emb_physchem_mlp_all.npy + row_id_clean.npy")


✅ 已保存 RF 全部结果 + 索引 到 rf_results_with_rowid.parquet
✅ 已保存 RF 模型 rf_physchem_embedding_meta_best.joblib
✅ RF 报告已保存到 rf_physchem_embedding_meta_report.json
✅ RF train/val 索引 & row_id 已保存为 numpy（rf_train_idx.npy 等）
✅ MLP embedding 全量 & 对齐 row_id 已保存为 emb_physchem_mlp_all.npy + row_id_clean.npy


In [12]:
# ====================================================
# 额外导出给「后期融合」用的一套 PhysChem RF 文件
# 命名严格对齐 text / graph 端
# ====================================================

# 1) label（log10(mg/L) 空间）
np.save(OUT_DIR / "rf_phys_y_train.npy", y_train_rf)   # shape = (N_train,)
np.save(OUT_DIR / "rf_phys_y_test.npy",  y_val_rf)     # shape = (N_val,)

# 2) train / test 的 row 索引（在 df_clean 里的行号）
np.save(OUT_DIR / "rf_phys_train_idx.npy", train_idx_rf)
np.save(OUT_DIR / "rf_phys_test_idx.npy",  val_idx_rf)

# 3) 预测值（这里**暂时**把 train 上的拟合结果当成 “伪 OOF”，val 当成 test）
#    如果以后你想要真正的 OOF，就像 text/graph 那样再写一段 10 折循环覆盖它们即可
np.save(OUT_DIR / "rf_phys_oof_pred_train.npy", y_train_pred_log)
np.save(OUT_DIR / "rf_phys_y_pred_test.npy",    y_val_pred_log)

print("✅ 已额外导出后期融合需要的 PhysChem 文件：")
print("   rf_phys_y_train.npy / rf_phys_y_test.npy")
print("   rf_phys_oof_pred_train.npy / rf_phys_y_pred_test.npy")
print("   rf_phys_train_idx.npy / rf_phys_test_idx.npy")


✅ 已额外导出后期融合需要的 PhysChem 文件：
   rf_phys_y_train.npy / rf_phys_y_test.npy
   rf_phys_oof_pred_train.npy / rf_phys_y_pred_test.npy
   rf_phys_train_idx.npy / rf_phys_test_idx.npy
