In [3]:
# Installation of rdkit-pypi is kept for reference only: because the lines are
# wrapped in a string literal, they are not executed when this cell runs.
# The recorded log shows rdkit-pypi 2022.9.5 was installed; consider pinning that
# version if this step is ever re-enabled.
'''
import sys
!{sys.executable} -m pip install rdkit-pypi
'''

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m34.7 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import MolStandardize
from rdkit.Chem.rdmolops import RemoveHs

# Directory holding the processed tables that this notebook reads and writes.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
OUTPUT_DIR = Path("/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data")
# Create the directory (including missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# Load the previously saved signature metadata table from OUTPUT_DIR.
meta = pd.read_parquet(OUTPUT_DIR / "l1000_signatures_metadata.parquet")


In [3]:
# Instantiate the RDKit MolStandardize helper objects.
# NOTE(review): in the cells visible here only `lfc` is used afterwards (and it is
# re-created in a later cell); `disconnector`, `normalizer` and `uncharger` are not
# referenced again — confirm whether they are still needed.
disconnector = MolStandardize.metal.MetalDisconnector()
normalizer   = MolStandardize.normalize.Normalizer()
lfc          = MolStandardize.fragment.LargestFragmentChooser()
uncharger    = MolStandardize.charge.Uncharger()

In [4]:
# Keep every row whose `smiles` value is not the literal placeholder "restricted",
# working on an independent copy so that `meta` itself is never modified.
df = meta.loc[~meta["smiles"].eq("restricted")].copy()
print("过滤 restricted 后剩余:", df.shape)

def canon_basic(smi: str):
    """Convert a SMILES string to RDKit canonical, non-isomeric SMILES.

    Args:
        smi: Input SMILES string.

    Returns:
        The canonical SMILES written without stereo/isotope information, or
        ``None`` when RDKit cannot parse the input or any exception is raised
        while parsing or writing.
    """
    canonical = None
    try:
        # MolFromSmiles parses with RDKit's default sanitization.
        parsed = Chem.MolFromSmiles(smi)
        if parsed is not None:
            canonical = Chem.MolToSmiles(parsed, canonical=True, isomericSmiles=False)
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        canonical = None
    return canonical

# Register tqdm's pandas integration so that `progress_apply` is available
# and shows a progress bar labelled "canon_basic".
tqdm.pandas(desc="canon_basic")
# Canonicalise every SMILES; rows that fail get None (see canon_basic).
df["smiles_canonical"] = df["smiles"].progress_apply(canon_basic)

# Report how many rows produced a canonical SMILES.
ok = df["smiles_canonical"].notna().sum()
print(f"canon_basic 通过: {ok}/{len(df)} ({ok/len(df):.2%})")

过滤 restricted 后剩余: (209540, 25)


canon_basic: 100%|████████████████████████████████████████████████████████████| 209540/209540 [00:44<00:00, 4704.16it/s]

canon_basic 通过: 209540/209540 (100.00%)





In [6]:
# Work on an independent copy: a plain `df_ok = df` only creates a second name for
# the same DataFrame, so every later column assignment on `df_ok` would also
# overwrite `df` and make any subsequent check against `df` see the modified data.
df_ok = df.copy()

In [7]:
# NOTE(review): `MolStandardize` is already imported in the first import cell and
# `lfc` is already created in an earlier cell; this cell repeats both.
from rdkit.Chem import MolStandardize
# Chooser used by choose_largest_fragment_if_needed to pick the largest fragment.
lfc = MolStandardize.fragment.LargestFragmentChooser()

def choose_largest_fragment_if_needed(smi_can: str):
    """Reduce a multi-fragment SMILES to the canonical SMILES of its largest fragment.

    Args:
        smi_can: A SMILES string, or a missing value (``None``, NaN, ``pd.NA``).

    Returns:
        * ``smi_can`` unchanged when it is not a string, contains no ``'.'``
          fragment separator, or cannot be parsed by RDKit;
        * otherwise the canonical, non-isomeric SMILES of the fragment selected
          by the module-level ``lfc`` chooser.
    """
    # Missing values may arrive as None, float NaN or pd.NA; none of them support
    # the `in` test below, so pass every non-string through untouched.
    if not isinstance(smi_can, str) or '.' not in smi_can:
        return smi_can
    m = Chem.MolFromSmiles(smi_can)
    if m is None:
        # Unparseable input: leave it as is rather than dropping the value.
        return smi_can
    m = lfc.choose(m)
    return Chem.MolToSmiles(m, canonical=True, isomericSmiles=False)

# Replace multi-fragment canonical SMILES with their largest fragment.
# Note: values are read from `df` and written to `df_ok`.
df_ok["smiles_canonical"] = df["smiles_canonical"].apply(choose_largest_fragment_if_needed)


In [9]:
# Clean the inchi_key placeholder: the string "-666" is replaced by NaN.
# `DataFrame.get` with a scalar default returns that bare scalar when the column
# is absent, and calling `.replace` on a float fails, so branch explicitly.
if "inchi_key" in df_ok.columns:
    df_ok["inchi_key"] = df_ok["inchi_key"].replace("-666", np.nan)
else:
    df_ok["inchi_key"] = np.nan

def make_compound_id(row):
    """Build a compound identifier for one metadata row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``"inchi_key"`` and ``"smiles_canonical"``.

    Returns:
        * the InChIKey when it is present;
        * otherwise ``"CANON::<smiles_canonical>"`` when a canonical SMILES exists;
        * ``None`` when both are missing, so that unidentifiable rows are not
          merged under one shared literal id such as ``"CANON::None"``.
    """
    if pd.notna(row["inchi_key"]):
        return row["inchi_key"]
    smiles = row["smiles_canonical"]
    if pd.isna(smiles):
        return None
    return f"CANON::{smiles}"

# Assign a compound identifier to every row (row-wise apply of make_compound_id).
df_ok["compound_id"] = df_ok.apply(make_compound_id, axis=1)

# One row per distinct (compound_id, inchi_key, smiles_canonical) combination.
compounds = df_ok[["compound_id","inchi_key","smiles_canonical"]].drop_duplicates()
# Persist both the compound-level table and the full sample-level table.
compounds.to_parquet(OUTPUT_DIR/"compounds_unique.parquet", index=False)
df_ok.to_parquet(OUTPUT_DIR/"l1000_signatures_metadata_canonical.parquet", index=False)

print("Unique compounds:", len(compounds))
print("Saved compounds ->", OUTPUT_DIR/"compounds_unique.parquet")
print("Saved sample-level ->", OUTPUT_DIR/"l1000_signatures_metadata_canonical.parquet")


Unique compounds: 21156
Saved compounds -> /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/compounds_unique.parquet
Saved sample-level -> /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/l1000_signatures_metadata_canonical.parquet


In [11]:
# Flag entries whose canonical SMILES still contains the '.' fragment separator
# (salts / mixtures). `na=False` makes missing SMILES count as "no match"; without
# it the mask contains NaN and the boolean indexing below raises.
# NOTE(review): this inspects `df`; whether that reflects values before or after
# largest-fragment selection depends on whether `df_ok` shares data with `df` — confirm.
mask_mix = df["smiles_canonical"].str.contains(r"\.", regex=True, na=False)

n_total = len(df)
n_mix = mask_mix.sum()

print(f"含 '.' (盐/混合物) 的条目: {n_mix}/{n_total} ({n_mix/n_total:.2%})")

# Show the first few examples.
print(df.loc[mask_mix, "smiles_canonical"].head(10).tolist())

含 '.' (盐/混合物) 的条目: 0/209540 (0.00%)
[]


In [12]:
# Display the deduplicated compound table.
compounds

Unnamed: 0,compound_id,inchi_key,smiles_canonical
0,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21
4,RTKIYFITIVXBLE-WKWSCTOISA-N,RTKIYFITIVXBLE-WKWSCTOISA-N,CC(C=CC(=O)NO)=CC(C)C(=O)c1ccc(N(C)C)cc1
7,QTQAWLPCGQOSGP-VDYWNIEOSA-N,QTQAWLPCGQOSGP-VDYWNIEOSA-N,COC1=C2CC(C)CC(OC)C(O)C(C)C=C(C)C(OC(N)=O)C(OC...
11,PXSMEPRWQNYAIL-UHFFFAOYSA-N,PXSMEPRWQNYAIL-UHFFFAOYSA-N,Cn1ccc2c(-c3ccc(C(C)(C)C)cc3)cc3c(c21)C1CCC3O1
15,WDLPDTHFFBNPGK-UHFFFAOYSA-N,WDLPDTHFFBNPGK-UHFFFAOYSA-N,O=C(CCNC(=O)c1ccc(Cl)cc1)NC1CCCc2ccccc21
...,...,...,...
207600,QJJXYPPXXYFBGM-NYOQZLQMSA-N,QJJXYPPXXYFBGM-NYOQZLQMSA-N,C=CCC1C=C(C)CC(C)CC(OC)C2OC(O)(C(=O)C(=O)N3CCC...
207606,LWUDDYHYYNNIQI-ZDUSSCGKSA-N,LWUDDYHYYNNIQI-ZDUSSCGKSA-N,CC(C)(C)OC(=O)c1ncn2c1C1CCCN1C(=O)c1c(Br)cccc1-2
207623,WAXQNWCZJDTGBU-UHFFFAOYSA-N,WAXQNWCZJDTGBU-UHFFFAOYSA-N,Cc1ccccc1-c1cc(N2CCN(C)CC2)ncc1N(C)C(=O)C(C)(C...
207653,BJJXHLWLUDYTGC-ANULTFPQSA-N,BJJXHLWLUDYTGC-ANULTFPQSA-N,C#CC1(O)CCC2C3CCC4=CC(=O)CCC4=C3C=CC21CC


In [13]:
# Display the sample-level table, including the added smiles_canonical and compound_id columns.
df_ok

Unnamed: 0,sig_id,pert_id,pert_iname,smiles,inchi_key,cell_id,cell_type,base_cell_id,modification,primary_site,...,pert_itime,phase,dose_value,dose_unit_raw,dose_uM,time_h,is_small_molecule,is_control,smiles_canonical,compound_id
0,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,0.37037,µM,0.37037,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
1,AML001_CD34_24H:BRD-A03772856:1.11111,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
2,AML001_CD34_24H:BRD-A03772856:10,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,10.00000,µM,10.00000,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
3,AML001_CD34_24H:BRD-A03772856:3.33333,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,3.33333,µM,3.33333,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
4,AML001_CD34_24H:BRD-A19037878:1.11111,BRD-A19037878,trichostatin-a,CC(\C=C(C)\C=C\C(=O)NO)C(=O)c1ccc(cc1)N(C)C,RTKIYFITIVXBLE-WKWSCTOISA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False,CC(C=CC(=O)NO)=CC(C)C(=O)c1ccc(N(C)C)cc1,RTKIYFITIVXBLE-WKWSCTOISA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209819,REP.A028_YAPC_24H:K09,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N
209820,REP.A028_YAPC_24H:M18,BRD-K96862998,pirfenidone,Cc1ccc(=O)n(c1)-c1ccccc1,ISWRGOKTTBVCFA-UHFFFAOYSA-N,yapc,cell line,YAPC,-666,pancreas,...,24 h,GSE70138,0.04000,um,0.04000,24.0,True,False,Cc1ccc(=O)n(-c2ccccc2)c1,ISWRGOKTTBVCFA-UHFFFAOYSA-N
209821,REP.A028_YAPC_24H:O01,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N
209822,REP.A028_YAPC_24H:O06,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N
