In [1]:
import os
import math
import json
from pathlib import Path

import numpy as np
import pandas as pd

from rdkit import Chem
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, randint

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR

from transformers import RobertaModel, RobertaTokenizer

# ========== 随机种子 ==========
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ========== 路径配置（按你之前 notebook）==========
DATA_PATH   = "/root/fusion_dataset/Invertebrates_EC50_unique.xlsx"  # 原始数据
MODEL_DIR   = "/root/多模态/model"                          # 本地 ChemBERTa 模型目录
SMILES_OUT_DIR = Path("/root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs")
SMILES_OUT_DIR.mkdir(parents=True, exist_ok=True)

# 统一嵌入输出路径（全体样本 CLS）
EMB_ALL_PATH = SMILES_OUT_DIR / "reg_smiles_cls_embeddings_all.npy"

print("SMILES 输出目录:", SMILES_OUT_DIR)


SMILES 输出目录: /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs


In [2]:
# 这里的 MODEL_DIR 要跟你保存 ChemBERTa 模型的路径一致
tokenizer = RobertaTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
import math

# 1. 读取数据
df = pd.read_excel(DATA_PATH, engine="openpyxl")

# 2. 必要列检查
required_cols = [
    "SMILES_Canonical_RDKit",
    "mgperL",
]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"原始表缺少必要列: {missing}")

# 3. 计算 log10 毒性（过滤掉 mgperL <= 0 的异常）
df = df.copy()
df["mgperL_log"] = df["mgperL"].apply(lambda x: math.log10(x) if x is not None and x > 0 else np.nan)
df = df.dropna(subset=["mgperL_log"]).reset_index(drop=True)

# 4. row_id（方便将来对齐，如果需要）
df["row_id"] = df.index

print("预处理后的 df 前 5 行：")
print(df.head())
print("样本总数:", len(df))


预处理后的 df 前 5 行：
     SMILES_Canonical_RDKit  Duration_Value(hour) Effect Endpoint  mgperL  \
0        [Cl-].[Cl-].[Zn+2]                  96.0    ITX     EC50     1.3   
1  O=S(=O)([O-])[O-].[Zn+2]                  24.0    ITX     EC50     2.5   
2        [Cl-].[Cl-].[Pb+2]                  96.0    ITX     EC50    40.8   
3  O=S(=O)([O-])[O-].[Cu+2]                  24.0    ITX     EC50     1.9   
4  O=S(=O)([O-])[O-].[Cu+2]                  96.0    ITX     EC50     0.6   

   Species Group                         ChemicalName        CAS  \
0  Invertebrates                Zinc chloride (ZnCl2)  7646-85-7   
1  Invertebrates       Sulfuric acid, Zinc salt (1:1)  7733-02-0   
2  Invertebrates                Lead chloride (PbCl2)  7758-95-4   
3  Invertebrates  Sulfuric acid copper(2+) salt (1:1)  7758-98-7   
4  Invertebrates  Sulfuric acid copper(2+) salt (1:1)  7758-98-7   

            CanonicalSMILES database  mgperL_log  row_id  
0        [Cl-].[Cl-].[Zn+2]   ECOTOX    0.113943     

In [4]:
# ===== 普通 Dataset：用于提取嵌入（预编码 tokenizer，推理快）=====
class SMILESDataset(torch.utils.data.Dataset):
    def __init__(self, smiles, targets, tokenizer, max_length=512):
        self.smiles = list(smiles)
        self.targets = np.array(targets, dtype=np.float32)
        self.tokenizer = tokenizer
        self.encodings = self.tokenizer(
            self.smiles,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long)
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long)
        label = torch.tensor(self.targets[idx], dtype=torch.float32)
        return input_ids, attention_mask, label


# ===== 含随机 SMILES 的增强版 Dataset：仅用于 BERT 训练 =====
class SMILESDatasetAug(torch.utils.data.Dataset):
    """
    - augment=True: 每次 __getitem__ 时随机生成一个 random SMILES（如果 mol 可以转）
    - augment=False: 使用 canonical SMILES
    """
    def __init__(self, smiles, targets, tokenizer, max_length=512, augment=False):
        self.smiles = list(smiles)   # canonical SMILES
        self.targets = np.array(targets, dtype=np.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

        # 预先转成 RDKit Mol
        self.mols = []
        for s in self.smiles:
            try:
                mol = Chem.MolFromSmiles(s)
            except Exception:
                mol = None
            self.mols.append(mol)

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        # 1) 生成一个 SMILES 序列
        mol = self.mols[idx]
        if self.augment and (mol is not None):
            # 随机 SMILES
            smiles_str = Chem.MolToSmiles(mol, doRandom=True)
        else:
            smiles_str = self.smiles[idx]

        # 2) tokenizer 编码
        encoding = self.tokenizer(
            smiles_str,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )

        input_ids = torch.tensor(encoding["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(encoding["attention_mask"], dtype=torch.long)
        label = torch.tensor(self.targets[idx], dtype=torch.float32)

        return input_ids, attention_mask, label


In [5]:
# 3. 文本端模型定义：ChemBERTa 回归器 + CLS 嵌入提取接口（768 维）

class ChemBERTaRegressor(nn.Module):
    """
    ChemBERTa 回归模型：
    - backbone: 预训练 Roberta/ChemBERTa
    - head: 基于 [CLS] 向量的一层 Linear 回归头
    - get_cls_embedding: 返回 CLS 作为下游 RF / 融合的输入特征
    """
    def __init__(
        self,
        model_name_or_path,
        freeze_embeddings: bool = True,
        freeze_n_layers: int = 0,
        dropout: float = 0.3,
    ):
        super().__init__()
        self.backbone = RobertaModel.from_pretrained(
            model_name_or_path,
            local_files_only=True,
        )
        hidden_size = self.backbone.config.hidden_size

        self.dropout = nn.Dropout(dropout)
        self.reg_head = nn.Linear(hidden_size, 1)

        # 冻结 embedding 层
        if freeze_embeddings:
            for p in self.backbone.embeddings.parameters():
                p.requires_grad = False

        # 冻结前 freeze_n_layers 个 encoder layer
        if freeze_n_layers > 0:
            encoder_layers = self.backbone.encoder.layer
            for layer in encoder_layers[:freeze_n_layers]:
                for p in layer.parameters():
                    p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls = outputs.last_hidden_state[:, 0, :]  # [CLS]
        x = self.dropout(cls)
        x = self.reg_head(x).squeeze(-1)          # (B,)
        return x

    def get_cls_embedding(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.backbone(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            cls = outputs.last_hidden_state[:, 0, :]
        return cls


In [6]:
# ========= 4. 文本端：基于 SMILES 分组做 8:2 划分，仅用于 BERT 训练 =========

smiles_all  = df["SMILES_Canonical_RDKit"].astype(str).tolist()
targets_all = df["mgperL_log"].values
groups_all  = df["SMILES_Canonical_RDKit"].astype(str).values  # 分组依据

gss_bert = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx_bert, val_idx_bert = next(gss_bert.split(smiles_all, targets_all, groups=groups_all))

smiles_train = [smiles_all[i] for i in train_idx_bert]
smiles_val   = [smiles_all[i] for i in val_idx_bert]
y_train_bert = targets_all[train_idx_bert]
y_val_bert   = targets_all[val_idx_bert]

print("BERT 训练集样本数:", len(smiles_train))
print("BERT 验证/测试集样本数:", len(smiles_val))

# ===== 构建 Dataset / DataLoader =====
ENC_BATCH_SIZE = 64
MAX_EPOCHS     = 100
PATIENCE       = 10
MAX_LEN        = 512

train_enc_dataset = SMILESDatasetAug(
    smiles_train, y_train_bert, tokenizer,
    max_length=MAX_LEN, augment=False,  # train 做增强
)
val_enc_dataset = SMILESDatasetAug(
    smiles_val, y_val_bert, tokenizer,
    max_length=MAX_LEN, augment=False,  # val 不增强
)

train_enc_loader = torch.utils.data.DataLoader(
    train_enc_dataset, batch_size=ENC_BATCH_SIZE,
    shuffle=True, drop_last=False,
)
val_enc_loader = torch.utils.data.DataLoader(
    val_enc_dataset, batch_size=ENC_BATCH_SIZE,
    shuffle=False, drop_last=False,
)

print("Train steps/epoch:", math.ceil(len(train_enc_dataset) / ENC_BATCH_SIZE))


BERT 训练集样本数: 2889
BERT 验证/测试集样本数: 731


[00:26:15] SMILES Parse Error: syntax error while parsing: [Cl]|[Sn](|[Cl])(|[Cl])CCCC
[00:26:15] SMILES Parse Error: check for mistakes around position 5:
[00:26:15] [Cl]|[Sn](|[Cl])(|[Cl])CCCC
[00:26:15] ~~~~^
[00:26:15] SMILES Parse Error: Failed parsing SMILES '[Cl]|[Sn](|[Cl])(|[Cl])CCCC' for input: '[Cl]|[Sn](|[Cl])(|[Cl])CCCC'
[00:26:15] SMILES Parse Error: syntax error while parsing: CCCCCCCCCCCC[S]|[Sn](|[S]CCCCCCCCCCCC)(CCCC)CCCC
[00:26:15] SMILES Parse Error: check for mistakes around position 16:
[00:26:15] CCCCCCCCCCCC[S]|[Sn](|[S]CCCCCCCCCCCC)(CC
[00:26:15] ~~~~~~~~~~~~~~~^
[00:26:15] SMILES Parse Error: Failed parsing SMILES 'CCCCCCCCCCCC[S]|[Sn](|[S]CCCCCCCCCCCC)(CCCC)CCCC' for input: 'CCCCCCCCCCCC[S]|[Sn](|[S]CCCCCCCCCCCC)(CCCC)CCCC'
[00:26:15] Explicit valence for atom # 0 O, 2, is greater than permitted
[00:26:15] Explicit valence for atom # 0 O, 2, is greater than permitted
[00:26:15] Explicit valence for atom # 0 O, 2, is greater than permitted
[00:26:15] Explicit 

Train steps/epoch: 46


[00:26:15] SMILES Parse Error: syntax error while parsing: O|[Hg]c1c([O-])c(Br)cc2c1Oc3cc([O-])c(Br)cc3C24OC(=O)c5ccccc45.[Na+].[Na+]
[00:26:15] SMILES Parse Error: check for mistakes around position 2:
[00:26:15] O|[Hg]c1c([O-])c(Br)cc2c1Oc3cc([O-])c(Br)
[00:26:15] ~^
[00:26:15] SMILES Parse Error: Failed parsing SMILES 'O|[Hg]c1c([O-])c(Br)cc2c1Oc3cc([O-])c(Br)cc3C24OC(=O)c5ccccc45.[Na+].[Na+]' for input: 'O|[Hg]c1c([O-])c(Br)cc2c1Oc3cc([O-])c(Br)cc3C24OC(=O)c5ccccc45.[Na+].[Na+]'
[00:26:15] SMILES Parse Error: syntax error while parsing: O(|[Ag])|[Ag]
[00:26:15] SMILES Parse Error: check for mistakes around position 3:
[00:26:15] O(|[Ag])|[Ag]
[00:26:15] ~~^
[00:26:15] SMILES Parse Error: Failed parsing SMILES 'O(|[Ag])|[Ag]' for input: 'O(|[Ag])|[Ag]'
[00:26:15] SMILES Parse Error: syntax error while parsing: O(|[Ag])|[Ag]
[00:26:15] SMILES Parse Error: check for mistakes around position 3:
[00:26:15] O(|[Ag])|[Ag]
[00:26:15] ~~^
[00:26:15] SMILES Parse Error: Failed parsing SMILES

In [7]:
from sklearn.metrics import mean_absolute_error, r2_score
from tqdm import tqdm

train_loader = train_enc_loader
val_loader   = val_enc_loader

def compute_metrics_simple(y_true, y_pred):
    r2  = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return r2, mae

# ===== 初始化 encoder =====
final_text_model = ChemBERTaRegressor(
    MODEL_DIR,
    freeze_embeddings=False,
    freeze_n_layers=0,   # 你可以调成 6，看你想不想多冻几层
    dropout=0.3,
).to(device)

# ===== 损失函数 =====
reg_criterion = nn.SmoothL1Loss()   # 你原来就是这个，beta 默认 1.0 也可以

# ===== 优化器 =====
optimizer = AdamW(final_text_model.parameters(), lr=5e-5, weight_decay=0.01)

# ===== OneCycleLR 调度器 =====
num_epochs = 100
patience = 5
best_val_loss = float("inf")
early_stop_counter = 0

scheduler = OneCycleLR(
    optimizer,
    max_lr=2e-3,
    steps_per_epoch=len(train_loader),
    epochs=num_epochs,
)

best_state_dict = None
best_epoch = -1

for epoch in range(num_epochs):
    # --------- 训练 ---------
    final_text_model.train()
    running_reg_loss = 0.0

    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch [{epoch+1}/{num_epochs}] - Training")
    for input_ids, attention_mask, reg_labels in train_loader_tqdm:
        input_ids  = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        reg_labels = reg_labels.to(device)

        optimizer.zero_grad()
        # ChemBERTaRegressor 的 forward 是 (input_ids, attention_mask)
        reg_output = final_text_model(input_ids=input_ids,
                                      attention_mask=attention_mask)

        reg_loss = reg_criterion(reg_output.squeeze(-1), reg_labels)
        reg_loss.backward()
        optimizer.step()
        scheduler.step()

        running_reg_loss += reg_loss.item() * reg_labels.size(0)
        train_loader_tqdm.set_postfix({'Reg Loss': reg_loss.item()})

    avg_reg_loss = running_reg_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Reg Loss: {avg_reg_loss:.4f}")

    # --------- 验证 ---------
    final_text_model.eval()
    val_reg_loss = 0.0
    all_preds = []
    all_labels = []

    val_loader_tqdm = tqdm(val_loader, desc=f"Epoch [{epoch+1}/{num_epochs}] - Validation")
    with torch.no_grad():
        for input_ids, attention_mask, reg_labels in val_loader_tqdm:
            input_ids  = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            reg_labels = reg_labels.to(device)

            reg_output = final_text_model(input_ids=input_ids,
                                          attention_mask=attention_mask)

            reg_loss = reg_criterion(reg_output.squeeze(-1), reg_labels)
            val_reg_loss += reg_loss.item() * reg_labels.size(0)

            all_preds.extend(reg_output.squeeze(-1).cpu().numpy())
            all_labels.extend(reg_labels.cpu().numpy())

            val_loader_tqdm.set_postfix({'Reg Loss': reg_loss.item()})

    avg_val_reg_loss = val_reg_loss / len(val_loader.dataset)
    val_mae = mean_absolute_error(all_labels, all_preds)
    val_r2  = r2_score(all_labels, all_preds)

    print(
        f"Val Reg Loss: {avg_val_reg_loss:.4f}, "
        f"Val MAE: {val_mae:.4f}, "
        f"Val R²: {val_r2:.4f}, "
        f"Early Stop Counter: {early_stop_counter}"
    )

    # --------- 早停 & 保存最好模型 ---------
    # 稍微加一个 1e-4 的裕量，避免浮动抖动
    if avg_val_reg_loss < best_val_loss - 1e-4:
        best_val_loss = avg_val_reg_loss
        early_stop_counter = 0
        best_epoch = epoch + 1

        # 1) 内存里保存一份，后面直接 load_state_dict 用
        best_state_dict = {
            k: v.detach().cpu().clone()
            for k, v in final_text_model.state_dict().items()
        }
        # 2) 硬盘也存一份（你原来的习惯）
        torch.save(best_state_dict, SMILES_OUT_DIR / "best_chemberta_regressor.pth")
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"早停触发，在第 {epoch+1} 个 epoch 停止训练")
            break

print(f"\n✅ BERT 训练完成，best_epoch = {best_epoch}, best_val_loss = {best_val_loss:.4f}")

# 恢复到最佳 checkpoint（如果你后面要用它来抽 embedding）
if best_state_dict is not None:
    final_text_model.load_state_dict(best_state_dict)
    final_text_model.to(device)
    final_text_model.eval()
    print("✅ 已加载最佳验证损失对应的模型权重，用于后续提取 CLS embeddings。")


Some weights of RobertaModel were not initialized from the model checkpoint at /root/多模态/model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch [1/100] - Training: 100%|██████████| 46/46 [00:29<00:00,  1.56it/s, Reg Loss=0.305]


Epoch [1/100], Train Reg Loss: 0.5609


Epoch [1/100] - Validation: 100%|██████████| 12/12 [00:02<00:00,  4.53it/s, Reg Loss=0.408]


Val Reg Loss: 0.5038, Val MAE: 0.8710, Val R²: 0.1786, Early Stop Counter: 0


Epoch [2/100] - Training: 100%|██████████| 46/46 [00:35<00:00,  1.28it/s, Reg Loss=0.277]


Epoch [2/100], Train Reg Loss: 0.4539


Epoch [2/100] - Validation: 100%|██████████| 12/12 [00:03<00:00,  3.64it/s, Reg Loss=0.263]


Val Reg Loss: 0.4883, Val MAE: 0.8510, Val R²: 0.1648, Early Stop Counter: 0


Epoch [3/100] - Training: 100%|██████████| 46/46 [00:44<00:00,  1.04it/s, Reg Loss=0.371]


Epoch [3/100], Train Reg Loss: 0.3942


Epoch [3/100] - Validation: 100%|██████████| 12/12 [00:06<00:00,  2.00it/s, Reg Loss=0.396]


Val Reg Loss: 0.4681, Val MAE: 0.8264, Val R²: 0.2125, Early Stop Counter: 0


Epoch [4/100] - Training: 100%|██████████| 46/46 [00:38<00:00,  1.19it/s, Reg Loss=0.437]


Epoch [4/100], Train Reg Loss: 0.3551


Epoch [4/100] - Validation: 100%|██████████| 12/12 [00:03<00:00,  3.25it/s, Reg Loss=0.322]


Val Reg Loss: 0.5327, Val MAE: 0.9018, Val R²: 0.0598, Early Stop Counter: 0


Epoch [5/100] - Training: 100%|██████████| 46/46 [00:38<00:00,  1.18it/s, Reg Loss=0.344]


Epoch [5/100], Train Reg Loss: 0.3568


Epoch [5/100] - Validation: 100%|██████████| 12/12 [00:03<00:00,  3.53it/s, Reg Loss=0.277]


Val Reg Loss: 0.4778, Val MAE: 0.8294, Val R²: 0.1702, Early Stop Counter: 1


Epoch [6/100] - Training: 100%|██████████| 46/46 [00:41<00:00,  1.11it/s, Reg Loss=0.371]


Epoch [6/100], Train Reg Loss: 0.3492


Epoch [6/100] - Validation: 100%|██████████| 12/12 [00:04<00:00,  2.86it/s, Reg Loss=0.431]


Val Reg Loss: 0.4614, Val MAE: 0.8299, Val R²: 0.2746, Early Stop Counter: 2


Epoch [7/100] - Training: 100%|██████████| 46/46 [00:39<00:00,  1.18it/s, Reg Loss=0.469]


Epoch [7/100], Train Reg Loss: 0.3677


Epoch [7/100] - Validation: 100%|██████████| 12/12 [00:02<00:00,  4.68it/s, Reg Loss=0.464]


Val Reg Loss: 0.5058, Val MAE: 0.8707, Val R²: 0.1418, Early Stop Counter: 0


Epoch [8/100] - Training: 100%|██████████| 46/46 [00:49<00:00,  1.08s/it, Reg Loss=0.263]


Epoch [8/100], Train Reg Loss: 0.5431


Epoch [8/100] - Validation: 100%|██████████| 12/12 [00:11<00:00,  1.08it/s, Reg Loss=0.87] 


Val Reg Loss: 0.6111, Val MAE: 1.0061, Val R²: -0.0004, Early Stop Counter: 1


Epoch [9/100] - Training: 100%|██████████| 46/46 [00:59<00:00,  1.29s/it, Reg Loss=0.64] 


Epoch [9/100], Train Reg Loss: 0.5626


Epoch [9/100] - Validation: 100%|██████████| 12/12 [00:10<00:00,  1.18it/s, Reg Loss=0.502]


Val Reg Loss: 0.6150, Val MAE: 1.0090, Val R²: -0.0958, Early Stop Counter: 2


Epoch [10/100] - Training: 100%|██████████| 46/46 [01:06<00:00,  1.45s/it, Reg Loss=0.396]


Epoch [10/100], Train Reg Loss: 0.5561


Epoch [10/100] - Validation: 100%|██████████| 12/12 [00:11<00:00,  1.06it/s, Reg Loss=0.642]


Val Reg Loss: 0.6014, Val MAE: 0.9939, Val R²: -0.0321, Early Stop Counter: 3


Epoch [11/100] - Training: 100%|██████████| 46/46 [00:55<00:00,  1.21s/it, Reg Loss=0.647]


Epoch [11/100], Train Reg Loss: 0.5533


Epoch [11/100] - Validation: 100%|██████████| 12/12 [00:07<00:00,  1.52it/s, Reg Loss=0.959]


Val Reg Loss: 0.6235, Val MAE: 1.0212, Val R²: -0.0074, Early Stop Counter: 4
早停触发，在第 11 个 epoch 停止训练

✅ BERT 训练完成，best_epoch = 6, best_val_loss = 0.4614
✅ 已加载最佳验证损失对应的模型权重，用于后续提取 CLS embeddings。


In [8]:
print("\n==== 使用训练好的 encoder 提取全体样本的 CLS 词嵌入 ====")

def extract_cls_embeddings(model, smiles, targets, tokenizer, device, batch_size=128, max_length=512):
    dataset = SMILESDataset(smiles, targets, tokenizer, max_length=max_length)
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
    )

    model.eval()
    all_embeds = []
    with torch.inference_mode():
        for input_ids, attention_mask, labels in loader:
            input_ids      = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            cls_emb = model.get_cls_embedding(input_ids, attention_mask)  # (B, H)
            all_embeds.append(cls_emb.cpu().numpy())

    all_embeds = np.concatenate(all_embeds, axis=0)
    return all_embeds

smiles_all  = df["SMILES_Canonical_RDKit"].astype(str).tolist()
targets_all = df["mgperL_log"].values

emb_all = extract_cls_embeddings(
    final_text_model,
    smiles_all,
    targets_all,
    tokenizer,
    device,
    batch_size=128,
    max_length=MAX_LEN,
)

print("CLS 嵌入形状:", emb_all.shape)
np.save(EMB_ALL_PATH, emb_all)
print(f"✅ All-data CLS embeddings 已保存到: {EMB_ALL_PATH}")



==== 使用训练好的 encoder 提取全体样本的 CLS 词嵌入 ====
CLS 嵌入形状: (3620, 768)
✅ All-data CLS embeddings 已保存到: /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs/reg_smiles_cls_embeddings_all.npy


In [9]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# ========= 5. RF 端：独立按 SMILES 分组做 8:2 划分 + 拼接 meta =========

# 1) 文本端 CLS 嵌入（刚刚保存过，这里可以直接用 emb_all，也可以从文件读）
X_text_all = np.load(EMB_ALL_PATH)        # (N, d_text)
y_all      = df["mgperL_log"].values      # (N,)
groups_all = df["SMILES_Canonical_RDKit"].astype(str).values

print("X_text_all 形状:", X_text_all.shape)
print("y_all 形状:", y_all.shape)

# 2) 指定你想用的 meta 列
#    —— 这里先按你说的 duration / effect / endpoint 来写
#    —— 会自动过滤掉 df 里不存在的列，避免 KeyError
NUM_META_COLS_CANDIDATE = ["Duration_Value(hour)"]          # 数值型
CAT_META_COLS_CANDIDATE = ["Effect", "Endpoint"]      # 类别型

NUM_META_COLS = [c for c in NUM_META_COLS_CANDIDATE if c in df.columns]
CAT_META_COLS = [c for c in CAT_META_COLS_CANDIDATE if c in df.columns]

print("数值型 meta 列:", NUM_META_COLS)
print("类别型 meta 列:", CAT_META_COLS)

# 3) 先按 SMILES 做 8:2 外层划分（这里 X 用个占位，不用真的特征避免混淆）
gss_rf = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=2025)
dummy_X = np.zeros((len(df), 1))  # 只为了满足 split 的接口
train_idx_rf, test_idx_rf = next(gss_rf.split(dummy_X, y_all, groups=groups_all))

y_train = y_all[train_idx_rf]
y_test  = y_all[test_idx_rf]
groups_train = groups_all[train_idx_rf]

# 文本 CLS 部分按索引切
X_text_train = X_text_all[train_idx_rf]
X_text_test  = X_text_all[test_idx_rf]

# 4) 构建 meta 的 DataFrame（先只切原始 df，后面再做编码）
df_train_meta = df.iloc[train_idx_rf].copy()
df_test_meta  = df.iloc[test_idx_rf].copy()

# 5) 数值型 meta：StandardScaler（只在 train 上 fit）
if len(NUM_META_COLS) > 0:
    scaler = StandardScaler()
    X_num_train = scaler.fit_transform(df_train_meta[NUM_META_COLS].values)
    X_num_test  = scaler.transform(df_test_meta[NUM_META_COLS].values)
else:
    # 如果一个数值列也没有，就给个空数组占位
    X_num_train = np.zeros((len(df_train_meta), 0), dtype=np.float32)
    X_num_test  = np.zeros((len(df_test_meta), 0), dtype=np.float32)

# 6) 类别型 meta：OneHotEncoder（同样只在 train 上 fit）
if len(CAT_META_COLS) > 0:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat_train = ohe.fit_transform(df_train_meta[CAT_META_COLS].astype(str))
    X_cat_test  = ohe.transform(df_test_meta[CAT_META_COLS].astype(str))
else:
    X_cat_train = np.zeros((len(df_train_meta), 0), dtype=np.float32)
    X_cat_test  = np.zeros((len(df_test_meta), 0), dtype=np.float32)

# 7) 把 meta 数值 + meta one-hot 拼在一起
X_meta_train = np.concatenate([X_num_train, X_cat_train], axis=1)
X_meta_test  = np.concatenate([X_num_test,  X_cat_test],  axis=1)

print("X_meta_train 形状:", X_meta_train.shape)
print("X_meta_test  形状:", X_meta_test.shape)

# 8) 最终特征 = 文本 CLS + meta
X_train = np.concatenate([X_text_train, X_meta_train], axis=1)
X_test  = np.concatenate([X_text_test,  X_meta_test],  axis=1)

print("RF 最终 X_train 形状:", X_train.shape)
print("RF 最终 X_test  形状:", X_test.shape)
print("RF 训练集样本数:", len(y_train))
print("RF 测试集样本数:", len(y_test))


X_text_all 形状: (3620, 768)
y_all 形状: (3620,)
数值型 meta 列: ['Duration_Value(hour)']
类别型 meta 列: ['Effect', 'Endpoint']
X_meta_train 形状: (2889, 4)
X_meta_test  形状: (731, 4)
RF 最终 X_train 形状: (2889, 772)
RF 最终 X_test  形状: (731, 772)
RF 训练集样本数: 2889
RF 测试集样本数: 731


In [10]:
from sklearn.ensemble import RandomForestRegressor

# ===== 基础 RF 模型 =====
rf_base = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
)

# ===== 超参数搜索空间 =====
param_distributions = {
    "n_estimators":      randint(200, 1001),         # 200 ~ 1000
    "max_depth":         [None, 10, 20, 30, 40],
    "min_samples_split": randint(2, 11),             # 2 ~ 10
    "min_samples_leaf":  randint(1, 5),              # 1 ~ 4
    "max_features":      ["sqrt", "log2", 0.5, 0.8],
}

gkf = GroupKFold(n_splits=10)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_distributions,
    n_iter=30,
    scoring="r2",
    cv=gkf,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

print("\n==== 开始在 RF 训练集上做十折随机搜索 (GroupKFold) ====")
rf_search.fit(X_train, y_train, groups=groups_train)

best_params   = rf_search.best_params_
best_cv_score = rf_search.best_score_

print("\n=== RF 超参搜索完成 ===")
print("Best params:", best_params)
print(f"Best CV R^2 (train 10-fold): {best_cv_score:.4f}")



==== 开始在 RF 训练集上做十折随机搜索 (GroupKFold) ====
Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=3, min_samples_split=9, n_estimators=900; total time= 2.4min
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=4, min_samples_split=6, n_estimators=299; total time=12.3min
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=543; total time= 1.5min
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=3, n_estimators=543; total time= 1.6min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=7, n_estimators=452; total time= 2.6min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=766; total time= 6.3min
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=366; total time=32.7min
[CV] END max_depth=10, max_features=0.8, min_samples_le

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import numpy as np
import json

# ===== 1) 基于最优超参，在同一套 10 折上计算 train OOF 预测 =====
print("\n==== 基于最优超参，在同一套 10 折上计算 train OOF 预测 ====")

# 用和 RandomizedSearch 一样的 GroupKFold 规则重建索引
# （GroupKFold 是确定性的，所以这套折和调参时的是同一套）
cv_indices_rf = list(gkf.split(X_train, y_train, groups=groups_train))

oof_pred_train = np.zeros_like(y_train, dtype=float)

for fold_idx, (tr_idx, val_idx) in enumerate(cv_indices_rf, 1):
    print(f"  -> OOF fold {fold_idx} / {len(cv_indices_rf)}")
    rf_fold = RandomForestRegressor(
        n_jobs=-1,
        random_state=2025 + fold_idx,
        **best_params,
    )
    rf_fold.fit(X_train[tr_idx], y_train[tr_idx])
    oof_pred_train[val_idx] = rf_fold.predict(X_train[val_idx])

# OOF 指标
r2_oof   = r2_score(y_train, oof_pred_train)
mae_oof  = mean_absolute_error(y_train, oof_pred_train)
mse_oof  = mean_squared_error(y_train, oof_pred_train)
rmse_oof = np.sqrt(mse_oof)
r_oof, _ = pearsonr(y_train, oof_pred_train)

print("\n===== RF 文本端：train OOF 表现（基于 BERT CLS+meta）=====")
print(f"OOF R^2  = {r2_oof:.4f}")
print(f"OOF MAE  = {mae_oof:.4f}")
print(f"OOF RMSE = {rmse_oof:.4f}")
print(f"OOF R    = {r_oof:.4f}")

# ===== 2) 用最优超参重新在整个训练集上拟合一个 RF =====
best_rf = RandomForestRegressor(
    n_jobs=-1,
    random_state=2025,
    **best_params,
)

best_rf.fit(X_train, y_train)

# ===== 3) 在训练集和独立测试集上评估 =====
y_pred_train = best_rf.predict(X_train)
y_pred_test  = best_rf.predict(X_test)

# 训练集指标（可选，但顺手算一下）
r2_train   = r2_score(y_train, y_pred_train)
mae_train  = mean_absolute_error(y_train, y_pred_train)
mse_train  = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r_train, _ = pearsonr(y_train, y_pred_train)

print("\n===== RF 文本端：训练集表现（基于 BERT CLS+meta）=====")
print(f"Train R^2  = {r2_train:.4f}")
print(f"Train MAE  = {mae_train:.4f}")
print(f"Train RMSE = {rmse_train:.4f}")
print(f"Train R    = {r_train:.4f}")

# 测试集指标（保留你原来的输出）
r2_test   = r2_score(y_test, y_pred_test)
mae_test  = mean_absolute_error(y_test, y_pred_test)
mse_test  = mean_squared_error(y_test, y_pred_test)  # 这里返回的是 MSE
rmse_test = np.sqrt(mse_test)                        # 手动转成 RMSE
r_test, _ = pearsonr(y_test, y_pred_test)

print("\n===== RF 文本端：独立测试集表现（基于 BERT CLS+meta）=====")
print(f"Test R^2  = {r2_test:.4f}")
print(f"Test MAE  = {mae_test:.4f}")
print(f"Test RMSE = {rmse_test:.4f}")
print(f"Test R    = {r_test:.4f}")

print("\n===== RF 文本端：独立测试集表现（基于 BERT CLS 嵌入）=====")
print(f"Test R^2  = {r2_test:.4f}")
print(f"Test MAE  = {mae_test:.4f}")
print(f"Test RMSE = {rmse_test:.4f}")
print(f"Test R    = {r_test:.4f}")

# ===== 4) 保存一些结果（方便后续融合/画图）=====

metrics_rf = {
    "train_metrics": {
        "r2":   float(r2_train),
        "mae":  float(mae_train),
        "rmse": float(rmse_train),
        "r":    float(r_train),
    },
    "test_metrics": {   # 原来的不能丢
        "r2":   float(r2_test),
        "mae":  float(mae_test),
        "rmse": float(rmse_test),
        "r":    float(r_test),
    },
    "oof_metrics": {    # 新增：OOF 指标
        "r2":   float(r2_oof),
        "mae":  float(mae_oof),
        "rmse": float(rmse_oof),
        "r":    float(r_oof),
    },
    "cv_search": {
        "best_cv_r2": float(best_cv_score),
        "best_params": {
            k: (int(v) if isinstance(v, np.integer) else v)
            for k, v in best_params.items()
        },
    },
}

with open(SMILES_OUT_DIR / "rf_text_pipeline_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics_rf, f, indent=2, ensure_ascii=False)

# 保存特征和标签（保留你原来的）
np.save(SMILES_OUT_DIR / "rf_text_X_train.npy", X_train.astype(np.float32))
np.save(SMILES_OUT_DIR / "rf_text_y_train.npy", y_train.astype(np.float32))
np.save(SMILES_OUT_DIR / "rf_text_X_test.npy",  X_test.astype(np.float32))
np.save(SMILES_OUT_DIR / "rf_text_y_test.npy",  y_test.astype(np.float32))

# 🔹 新增：专门给后期融合读取用的文件
np.save(SMILES_OUT_DIR / "rf_text_oof_pred_train.npy", oof_pred_train.astype(np.float32))
np.save(SMILES_OUT_DIR / "rf_text_y_pred_test.npy",    y_pred_test.astype(np.float32))
np.save(SMILES_OUT_DIR / "rf_text_train_idx.npy",      train_idx_rf.astype(np.int64))  # df 行号
np.save(SMILES_OUT_DIR / "rf_text_test_idx.npy",       test_idx_rf.astype(np.int64))

print("\n✅ RF 相关数据和指标已保存到:", SMILES_OUT_DIR)
print("   - rf_text_y_train.npy / rf_text_y_test.npy")
print("   - rf_text_oof_pred_train.npy  (train OOF，用于 late fusion)")
print("   - rf_text_y_pred_test.npy     (test 预测，用于 late fusion)")
print("   - rf_text_train_idx.npy / rf_text_test_idx.npy (与 df 行号对应)")



==== 基于最优超参，在同一套 10 折上计算 train OOF 预测 ====
  -> OOF fold 1 / 10
  -> OOF fold 2 / 10
  -> OOF fold 3 / 10
  -> OOF fold 4 / 10
  -> OOF fold 5 / 10
  -> OOF fold 6 / 10
  -> OOF fold 7 / 10
  -> OOF fold 8 / 10
  -> OOF fold 9 / 10
  -> OOF fold 10 / 10

===== RF 文本端：train OOF 表现（基于 BERT CLS+meta）=====
OOF R^2  = 0.5418
OOF MAE  = 0.5775
OOF RMSE = 0.7993
OOF R    = 0.7362

===== RF 文本端：训练集表现（基于 BERT CLS+meta）=====
Train R^2  = 0.8846
Train MAE  = 0.2740
Train RMSE = 0.4011
Train R    = 0.9452

===== RF 文本端：独立测试集表现（基于 BERT CLS+meta）=====
Test R^2  = 0.5395
Test MAE  = 0.6097
Test RMSE = 0.8863
Test R    = 0.7388

===== RF 文本端：独立测试集表现（基于 BERT CLS 嵌入）=====
Test R^2  = 0.5395
Test MAE  = 0.6097
Test RMSE = 0.8863
Test R    = 0.7388

✅ RF 相关数据和指标已保存到: /root/Invertebrates_EC50_multi_fusion/SMILES/smiles_outputs
   - rf_text_y_train.npy / rf_text_y_test.npy
   - rf_text_oof_pred_train.npy  (train OOF，用于 late fusion)
   - rf_text_y_pred_test.npy     (test 预测，用于 late fusion)
   - rf_text_trai