In [8]:
import pandas as pd
import numpy as np
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski
from rdkit.Chem.rdMolDescriptors import (
    CalcLabuteASA, CalcTPSA, CalcNumAromaticRings,
    CalcFractionCSP3, CalcKappa1, CalcKappa2, CalcKappa3,
)
from rdkit.Chem import GraphDescriptors
from rdkit.Chem.EState import EStateIndices
from rdkit.Chem import AllChem

from tqdm import tqdm

# ========= User-editable settings =========
# 1) Folder that holds the input Excel workbooks.
#    TODO: point this at your own data folder.
INPUT_DIR = Path("/root/fusion_dataset")

# 2) Name of the SMILES column; it must be spelled identically in every workbook.
#    TODO: change this to the column name used in your files.
SMILES_COL = "SMILES_Canonical_RDKit"

# 3) Destination folder for the augmented workbooks. Here it is a new
#    sub-folder of INPUT_DIR, but any path works. It is created right away,
#    together with any missing parent folders.
OUTPUT_DIR = INPUT_DIR.joinpath("with_physchem_excels")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# ==========================================

# Echo the configured locations so a wrong path is visible immediately.
print("输入文件夹:", INPUT_DIR)
print("输出文件夹:", OUTPUT_DIR)


输入文件夹: /root/fusion_dataset
输出文件夹: /root/fusion_dataset/with_physchem_excels


In [9]:
def smiles_to_mol(smiles: str):
    """Parse one SMILES cell into an RDKit Mol, or return None if it is unusable.

    Args:
        smiles: Raw cell value from the SMILES column. Missing values
            (None / NaN / pd.NA) and blank strings are accepted.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for the string form of the
        value, or None when the value is missing, blank, or parsing raises.
    """
    if pd.isna(smiles):
        return None

    text = str(smiles)
    # A blank SMILES is parsed by RDKit into a zero-atom molecule rather than
    # None, so it would escape the invalid-row count and produce meaningless
    # descriptors. Treat blank cells as invalid up front.
    if not text.strip():
        return None

    try:
        return Chem.MolFromSmiles(text)
    except Exception:
        # Best effort: any parser failure marks the row as invalid.
        return None


# Output columns, in order. Shared by the NaN placeholder row and the computed
# row so the two schemas cannot drift apart.
_PHYSCHEM_COLUMNS = (
    # DESC_ family
    "DESC_MolWt",
    "DESC_ExactMolWt",
    "DESC_HeavyAtomCount",
    "DESC_RingCount",
    "DESC_NumAromaticRings",
    "DESC_FractionCSP3",
    "DESC_MolLogP",
    "DESC_TPSA",
    "DESC_ASA_Labute",
    "DESC_HBA",
    "DESC_HBD",
    "DESC_RotatableBonds",
    "DESC_FormalCharge",
    "DESC_MaxAbsPartialCharge",
    "DESC_MinAbsPartialCharge",
    # KIER_ family
    "KIER_Kappa1",
    "KIER_Kappa2",
    "KIER_Kappa3",
    "KIER_Chi0v",
    "KIER_Chi1v",
    "KIER_Chi2v",
    # ESTATE_ family
    "ESTATE_mean",
    "ESTATE_std",
    "ESTATE_sum",
)


def _abs_charge_extremes(charges):
    """Return (max |q|, min |q|) of per-atom charges, or (NaN, NaN) if unusable."""
    abs_q = np.abs(np.asarray(charges, dtype=float))
    # Built-in max()/min() give order-dependent answers when a NaN is present,
    # so any non-finite charge (or an empty list) invalidates both extremes.
    if abs_q.size == 0 or not np.all(np.isfinite(abs_q)):
        return np.nan, np.nan
    return float(abs_q.max()), float(abs_q.min())


def compute_physchem(mol):
    """Compute a fixed set of physicochemical descriptors for one RDKit Mol.

    Args:
        mol: An RDKit molecule, or None for a row whose SMILES was invalid.

    Returns:
        dict mapping the 24 column names in ``_PHYSCHEM_COLUMNS`` (prefixes
        DESC_/KIER_/ESTATE_) to values. When ``mol`` is None every value is
        NaN. The partial-charge pair, the Kappa/Chi group and the EState
        statistics each fall back to NaN as a group if their computation
        raises; the partial-charge pair is also NaN when any Gasteiger charge
        is non-finite.

    Raises:
        Whatever the basic DESC_ descriptor calls raise; those are not guarded.
    """
    if mol is None:
        # Placeholder row so invalid SMILES keep their position in the table.
        return dict.fromkeys(_PHYSCHEM_COLUMNS, np.nan)

    # ===== DESC_ part =====
    mol_wt = Descriptors.MolWt(mol)
    exact_mol_wt = Descriptors.ExactMolWt(mol)
    heavy_atoms = Descriptors.HeavyAtomCount(mol)
    ring_count = Descriptors.RingCount(mol)
    aromatic_rings = CalcNumAromaticRings(mol)
    fraction_csp3 = CalcFractionCSP3(mol)

    mol_logp = Crippen.MolLogP(mol)
    tpsa = CalcTPSA(mol)
    asa_labute = CalcLabuteASA(mol)

    hba = Lipinski.NumHAcceptors(mol)
    hbd = Lipinski.NumHDonors(mol)
    rot_bonds = Lipinski.NumRotatableBonds(mol)

    formal_charge = Chem.GetFormalCharge(mol)

    # Gasteiger partial charges, computed on a copy with explicit hydrogens.
    try:
        mol_h = Chem.AddHs(mol)
        AllChem.ComputeGasteigerCharges(mol_h)
        charges = [float(a.GetProp("_GasteigerCharge")) for a in mol_h.GetAtoms()]
        max_abs_q, min_abs_q = _abs_charge_extremes(charges)
    except Exception:
        max_abs_q = min_abs_q = np.nan

    # ===== KIER_ part (Kappa & Chi) =====
    # All six values share one guard: a failure in any of them blanks the group.
    try:
        kappa1 = CalcKappa1(mol)
        kappa2 = CalcKappa2(mol)
        kappa3 = CalcKappa3(mol)

        chi0v = GraphDescriptors.Chi0v(mol)
        chi1v = GraphDescriptors.Chi1v(mol)
        chi2v = GraphDescriptors.Chi2v(mol)
    except Exception:
        kappa1 = kappa2 = kappa3 = np.nan
        chi0v = chi1v = chi2v = np.nan

    # ===== ESTATE_ part =====
    try:
        estate_vals = EStateIndices(mol)  # one EState value per atom
        if len(estate_vals) > 0:
            est_mean = float(np.mean(estate_vals))
            est_std = float(np.std(estate_vals))
            est_sum = float(np.sum(estate_vals))
        else:
            est_mean = est_std = est_sum = np.nan
    except Exception:
        est_mean = est_std = est_sum = np.nan

    values = {
        # DESC
        "DESC_MolWt": mol_wt,
        "DESC_ExactMolWt": exact_mol_wt,
        "DESC_HeavyAtomCount": heavy_atoms,
        "DESC_RingCount": ring_count,
        "DESC_NumAromaticRings": aromatic_rings,
        "DESC_FractionCSP3": fraction_csp3,
        "DESC_MolLogP": mol_logp,
        "DESC_TPSA": tpsa,
        "DESC_ASA_Labute": asa_labute,
        "DESC_HBA": hba,
        "DESC_HBD": hbd,
        "DESC_RotatableBonds": rot_bonds,
        "DESC_FormalCharge": formal_charge,
        "DESC_MaxAbsPartialCharge": max_abs_q,
        "DESC_MinAbsPartialCharge": min_abs_q,
        # KIER
        "KIER_Kappa1": kappa1,
        "KIER_Kappa2": kappa2,
        "KIER_Kappa3": kappa3,
        "KIER_Chi0v": chi0v,
        "KIER_Chi1v": chi1v,
        "KIER_Chi2v": chi2v,
        # ESTATE
        "ESTATE_mean": est_mean,
        "ESTATE_std": est_std,
        "ESTATE_sum": est_sum,
    }
    # Project through the shared column tuple: fixes the key order and fails
    # loudly (KeyError) if the schema and the computed values ever diverge.
    return {name: values[name] for name in _PHYSCHEM_COLUMNS}


In [12]:
# Collect every Excel workbook directly inside INPUT_DIR (non-recursive, so
# previously written files in the output sub-folder are not picked up).
excel_files = list(INPUT_DIR.glob("*.xlsx")) + list(INPUT_DIR.glob("*.xls"))
print(f"在 {INPUT_DIR} 里找到了 {len(excel_files)} 个 Excel 文件。")

# Descriptor column names, taken from the placeholder row. Passing them to the
# DataFrame constructor keeps the columns present even for a zero-row workbook.
physchem_columns = list(compute_physchem(None))

for f in excel_files:
    print(f"\n===== 处理文件: {f.name} =====")
    try:
        # 1) Load the workbook. The glob patterns above already restrict the
        #    list to .xlsx/.xls, so no further suffix check is needed.
        df = pd.read_excel(f)
        print("  原始形状:", df.shape)

        # 2) The SMILES column must exist, otherwise the file is skipped.
        if SMILES_COL not in df.columns:
            print(f"  ⛔ 找不到 SMILES 列 '{SMILES_COL}'，跳过该文件。")
            continue

        # 3) SMILES -> Mol. The molecules live in their own Series instead of a
        #    scratch "mol" column on df, so an input column that happens to be
        #    named "mol" is neither overwritten nor dropped from the output.
        mols = df[SMILES_COL].apply(smiles_to_mol)
        n_invalid = mols.isna().sum()
        print(f"  无效 SMILES 行数: {n_invalid}")

        # 4) Compute descriptors row by row (invalid rows get a NaN placeholder).
        physchem_list = []
        for mol in tqdm(mols, desc=f"  Computing physchem for {f.name}"):
            physchem_list.append(compute_physchem(mol))

        # Reuse df's index so the column-wise concat below pairs rows by
        # position instead of relying on df having a default RangeIndex.
        physchem_df = pd.DataFrame(
            physchem_list, index=df.index, columns=physchem_columns
        )
        print("  理化性质形状:", physchem_df.shape)

        # 5) Append the descriptor columns to the original table.
        df_merged = pd.concat([df, physchem_df], axis=1)
        print("  合并后形状:", df_merged.shape)

        # 6) Output name: original stem + "_physchem.xlsx".
        out_name = f.stem + "_physchem.xlsx"
        out_path = OUTPUT_DIR / out_name

        df_merged.to_excel(out_path, index=False)
        print("  ✅ 已保存:", out_path)

    except Exception as e:
        # Best effort per file: report the failure and move on to the next one.
        print(f"  ❌ 处理文件 {f.name} 时出错: {e}")


在 /root/fusion_dataset 里找到了 7 个 Excel 文件。

===== 处理文件: algae_EC10_unique.xlsx =====
  原始形状: (3980, 10)


[10:55:16] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:16] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:16] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:16] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:16] SMILES Parse Error: syntax error while parsing: O|[Co](|O)|O
[10:55:16] SMILES Parse Error: check for mistakes around position 2:
[10:55:16] O|[Co](|O)|O
[10:55:16] ~^
[10:55:16] SMILES Parse Error: Failed parsing SMILES 'O|[Co](|O)|O' for input: 'O|[Co](|O)|O'
[10:55:16] SMILES Parse Error: syntax error while parsing: [NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]
[10:55:16] SMILES Parse Error: check for mistakes around position 7:
[10:55:16] [NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[C
[10:55:16] ~~~~~~^
[10:55:16] SMILES Parse Error: Failed parsing SMILES '[NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]' for input: '[NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]'
[10:55:16] SMILES Pa

  无效 SMILES 行数: 43


  Computing physchem for algae_EC10_unique.xlsx: 100%|██████████| 3980/3980 [00:03<00:00, 1007.92it/s]


  理化性质形状: (3980, 24)
  合并后形状: (3980, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/algae_EC10_unique_physchem.xlsx

===== 处理文件: algae_EC50_unique.xlsx =====
  原始形状: (4718, 10)


[10:55:24] SMILES Parse Error: syntax error while parsing: CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC
[10:55:24] SMILES Parse Error: check for mistakes around position 10:
[10:55:24] CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O
[10:55:24] ~~~~~~~~~^
[10:55:24] SMILES Parse Error: Failed parsing SMILES 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC' for input: 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC'
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:24] Explicit valence for atom # 0

  无效 SMILES 行数: 63


  Computing physchem for algae_EC50_unique.xlsx: 100%|██████████| 4718/4718 [00:04<00:00, 1107.31it/s]


  理化性质形状: (4718, 24)
  合并后形状: (4718, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/algae_EC50_unique_physchem.xlsx

===== 处理文件: Aqutic_unique.xlsx =====
  原始形状: (28461, 10)


[10:55:34] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:34] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:34] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:34] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:55:34] SMILES Parse Error: syntax error while parsing: O|[Co](|O)|O
[10:55:34] SMILES Parse Error: check for mistakes around position 2:
[10:55:34] O|[Co](|O)|O
[10:55:34] ~^
[10:55:34] SMILES Parse Error: Failed parsing SMILES 'O|[Co](|O)|O' for input: 'O|[Co](|O)|O'
[10:55:34] SMILES Parse Error: syntax error while parsing: [NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]
[10:55:34] SMILES Parse Error: check for mistakes around position 7:
[10:55:34] [NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[C
[10:55:34] ~~~~~~^
[10:55:34] SMILES Parse Error: Failed parsing SMILES '[NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]' for input: '[NH2-]|[Pd++](|[NH2-])(|[NH2-])|[NH2-].[Cl].[Cl]'
[10:55:34] SMILES Pa

  无效 SMILES 行数: 448


  Computing physchem for Aqutic_unique.xlsx: 100%|██████████| 28461/28461 [00:28<00:00, 1012.12it/s]


  理化性质形状: (28461, 24)
  合并后形状: (28461, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/Aqutic_unique_physchem.xlsx

===== 处理文件: fish_EC10_unique.xlsx =====
  原始形状: (8996, 10)


[10:56:24] SMILES Parse Error: syntax error while parsing: CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC
[10:56:24] SMILES Parse Error: check for mistakes around position 10:
[10:56:24] CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O
[10:56:24] ~~~~~~~~~^
[10:56:24] SMILES Parse Error: Failed parsing SMILES 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC' for input: 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC'
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:24] Explicit valence for atom # 0

  无效 SMILES 行数: 83


  Computing physchem for fish_EC10_unique.xlsx: 100%|██████████| 8996/8996 [00:09<00:00, 922.46it/s] 


  理化性质形状: (8996, 24)
  合并后形状: (8996, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/fish_EC10_unique_physchem.xlsx

===== 处理文件: fish_EC50_unique.xlsx =====
  原始形状: (3191, 10)


[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:41] SMILES Parse Error: syntax error while parsing: O|[Co](|O)|O
[10:56:41] SMILES Parse Error: check for mistakes ar

  无效 SMILES 行数: 72


  Computing physchem for fish_EC50_unique.xlsx: 100%|██████████| 3191/3191 [00:02<00:00, 1108.18it/s]


  理化性质形状: (3191, 24)
  合并后形状: (3191, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/fish_EC50_unique_physchem.xlsx

===== 处理文件: Invertebrates_EC10_unique.xlsx =====
  原始形状: (3954, 10)


[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is greater than permitted
[10:56:47] Explicit valence for atom # 0 O, 2, is g

  无效 SMILES 行数: 83


  Computing physchem for Invertebrates_EC10_unique.xlsx: 100%|██████████| 3954/3954 [00:04<00:00, 986.38it/s] 


  理化性质形状: (3954, 24)
  合并后形状: (3954, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/Invertebrates_EC10_unique_physchem.xlsx

===== 处理文件: Invertebrates_EC50_unique.xlsx =====
  原始形状: (3620, 10)


[10:56:54] SMILES Parse Error: syntax error while parsing: CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC
[10:56:54] SMILES Parse Error: check for mistakes around position 10:
[10:56:54] CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O
[10:56:54] ~~~~~~~~~^
[10:56:54] SMILES Parse Error: Failed parsing SMILES 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC' for input: 'CCCC[Sn](|[S]CC(=O)OCC(CC)CCCC)(|[S]CC(=O)OCC(CC)CCCC)CCCC'
[10:56:54] SMILES Parse Error: syntax error while parsing: [Cl]|[Sn](|[Cl])(|[Cl])CCCC
[10:56:54] SMILES Parse Error: check for mistakes around position 5:
[10:56:54] [Cl]|[Sn](|[Cl])(|[Cl])CCCC
[10:56:54] ~~~~^
[10:56:54] SMILES Parse Error: Failed parsing SMILES '[Cl]|[Sn](|[Cl])(|[Cl])CCCC' for input: '[Cl]|[Sn](|[Cl])(|[Cl])CCCC'
[10:56:54] SMILES Parse Error: syntax error while parsing: CCCCCCCCCCCC[S]|[Sn](|[S]CCCCCCCCCCCC)(CCCC)CCCC
[10:56:54] SMILES Parse Error: check for mistakes around position 16:
[10:56:54] CCCCCCCCCCCC[S]|[Sn](|[S]C

  无效 SMILES 行数: 104


  Computing physchem for Invertebrates_EC50_unique.xlsx: 100%|██████████| 3620/3620 [00:03<00:00, 1159.95it/s]


  理化性质形状: (3620, 24)
  合并后形状: (3620, 34)
  ✅ 已保存: /root/fusion_dataset/with_physchem_excels/Invertebrates_EC50_unique_physchem.xlsx
