In [10]:
'''
with h5py.File(file_path, "r") as f:
    # Extract IDs
    rid = [x.decode() for x in f["/0/META/ROW/id"][()]]
    cid = [x.decode() for x in f["/0/META/COL/id"][()]]
    print("Genes:", len(rid))      # should be 12328
    print("Samples:", len(cid))    # should be 473647

    # Access the matrix
    mat = f["/0/DATA/0/matrix"]

    # Example: first 100 genes × first 10 samples
    submat = mat[:100, :10]

# Put into DataFrame
df = pd.DataFrame(submat, index=rid[:100], columns=cid[:10])
print(df.shape)

Genes: 12328
Samples: 473647
(100, 10)


In [1]:
import pandas as pd
from pathlib import Path


import os

# Root directory that directly contains the GSE92742/ and GSE70138/ folders.
# Set the LINCS_DATA_ROOT environment variable to point the notebook at another
# location; when it is unset the original hard-coded path is used.
DATA_ROOT = Path(
    os.environ.get("LINCS_DATA_ROOT", "/ShangGaoAIProjects/Lingge/LINCS/data")
).resolve()

# Processed artefacts live in a sub-folder of the data root; create it if needed.
OUTPUT_DIR = DATA_ROOT / "Processed_data"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo both paths as the cell output.
DATA_ROOT, OUTPUT_DIR

(PosixPath('/ShangGaoAIProjects/Lingge/LINCS/data'),
 PosixPath('/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data'))

In [2]:
# Print the name of every entry found directly inside each series folder,
# prefixed with the numeric part of the series accession.
dataset_labels = {"GSE92742": "92742:", "GSE70138": "70138:"}
for folder, label in dataset_labels.items():
    for entry in (DATA_ROOT / folder).glob("*"):
        print(label, entry.name)


92742: GSE92742_Broad_LINCS_cell_info.txt.gz
92742: GSE92742_Broad_LINCS_gene_info.txt.gz
92742: GSE92742_Broad_LINCS_sig_info.txt.gz
92742: GSE92742_Broad_LINCS_pert_info.txt.gz
92742: GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx
92742: GSE92742_Broad_LINCS_inst_info.txt.gz
70138: GSE70138_Broad_LINCS_sig_info_2017_03_06.txt.gz
70138: GSE70138_Broad_LINCS_gene_info_2017_03_06.txt.gz
70138: GSE70138_Broad_LINCS_pert_info_2017_03_06.txt.gz
70138: GSE70138_Broad_LINCS_inst_info_2017_03_06.txt.gz
70138: GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx
70138: GSE70138_Broad_LINCS_cell_info_2017_04_28.txt.gz


In [3]:
def _read_lincs_table(series, file_name):
    """Read one tab-delimited annotation table from ``DATA_ROOT / series``.

    Parameters
    ----------
    series : str
        Name of the series folder under ``DATA_ROOT`` (e.g. ``"GSE92742"``).
    file_name : str
        File name inside that folder.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Notes
    -----
    ``low_memory=False`` makes pandas infer each column's dtype from the whole
    column rather than chunk by chunk. The recorded output of this cell echoes
    the sig_info and inst_info reads, which looks like a mixed-dtype
    ``DtypeWarning`` (NOTE(review): the warning text itself is truncated in the
    saved output -- confirm). Whole-column inference avoids columns that hold a
    mix of parsed numbers and strings.
    """
    return pd.read_table(DATA_ROOT / series / file_name, low_memory=False)


# GSE92742 annotation tables.
GSE92742_cell_info = _read_lincs_table("GSE92742", "GSE92742_Broad_LINCS_cell_info.txt.gz")
GSE92742_gene_info = _read_lincs_table("GSE92742", "GSE92742_Broad_LINCS_gene_info.txt.gz")
GSE92742_sig_info = _read_lincs_table("GSE92742", "GSE92742_Broad_LINCS_sig_info.txt.gz")
GSE92742_inst_info = _read_lincs_table("GSE92742", "GSE92742_Broad_LINCS_inst_info.txt.gz")
GSE92742_pert_info = _read_lincs_table("GSE92742", "GSE92742_Broad_LINCS_pert_info.txt.gz")

# GSE70138 annotation tables (file names carry a release date suffix).
GSE70138_cell_info = _read_lincs_table("GSE70138", "GSE70138_Broad_LINCS_cell_info_2017_04_28.txt.gz")
GSE70138_gene_info = _read_lincs_table("GSE70138", "GSE70138_Broad_LINCS_gene_info_2017_03_06.txt.gz")
GSE70138_sig_info = _read_lincs_table("GSE70138", "GSE70138_Broad_LINCS_sig_info_2017_03_06.txt.gz")
GSE70138_inst_info = _read_lincs_table("GSE70138", "GSE70138_Broad_LINCS_inst_info_2017_03_06.txt.gz")
GSE70138_pert_info = _read_lincs_table("GSE70138", "GSE70138_Broad_LINCS_pert_info_2017_03_06.txt.gz")


  GSE92742_sig_info = pd.read_table(DATA_ROOT / "GSE92742" / 'GSE92742_Broad_LINCS_sig_info.txt.gz')
  GSE92742_inst_info = pd.read_table(DATA_ROOT / "GSE92742" / 'GSE92742_Broad_LINCS_inst_info.txt.gz')


In [5]:
# Prepare the GSE92742 inputs for the metadata join. The signature table is used
# as-is; the perturbagen and cell tables are reduced to a fixed list of annotation
# columns (only those actually present) to limit overlapping column names later.

sig1 = GSE92742_sig_info

pert_keep = ["pert_id","pert_iname","canonical_smiles","smiles","inchi_key",
             "moa","moa2","target","target_gene","pcl_id","pcl_name"]
# Boolean column mask keeps the table's own column order.
pert1 = GSE92742_pert_info.loc[:, GSE92742_pert_info.columns.isin(pert_keep)]

cell_keep = ["cell_id","cell_name","lineage","primary_site","disease",
             "subtype","cell_type","base_cell_id","modification"]
cell1 = GSE92742_cell_info.loc[:, GSE92742_cell_info.columns.isin(cell_keep)]


In [6]:
# Same preparation for GSE70138: keep the signature table untouched and reduce the
# perturbagen, cell and instance tables to the listed columns that exist in each.

sig2 = GSE70138_sig_info

pert_keep = ["pert_id","pert_iname","canonical_smiles","smiles","inchi_key",
             "moa","moa2","target","target_gene","pcl_id","pcl_name"]
# Boolean column mask keeps the table's own column order.
pert2 = GSE70138_pert_info.loc[:, GSE70138_pert_info.columns.isin(pert_keep)]

cell_keep = ["cell_id","cell_name","lineage","primary_site","disease",
             "subtype","cell_type","base_cell_id","modification"]
cell2 = GSE70138_cell_info.loc[:, GSE70138_cell_info.columns.isin(cell_keep)]

inst_keep = ["sig_id","det_plate","det_well","rna_plate","rna_well"]
inst2 = GSE70138_inst_info.loc[:, GSE70138_inst_info.columns.isin(inst_keep)]



In [10]:
# Rebuild meta1/meta2 from sig_info x pert_info x cell_info only (inst_info is not
# joined), then stack both series into a single table named `meta`.

def _assemble_meta(sig, pert, cell, phase):
    """Left-join perturbagen and cell annotations onto one series' signatures.

    The result gets a ``phase`` column holding ``phase``, is guaranteed to have a
    ``smiles`` column (gaps filled from ``canonical_smiles`` when that column
    exists), and has ``cell_id`` converted to lower-case strings.
    """
    merged = sig.merge(pert, on="pert_id", how="left")
    merged = merged.merge(cell, on="cell_id", how="left")
    merged["phase"] = phase

    # Unify the SMILES column.
    if "smiles" not in merged.columns:
        merged["smiles"] = pd.NA
    if "canonical_smiles" in merged.columns:
        gaps = merged["smiles"].isna()
        merged.loc[gaps, "smiles"] = merged.loc[gaps, "canonical_smiles"]

    # Lower-case the cell identifier (done after the join on cell_id).
    if "cell_id" in merged.columns:
        merged["cell_id"] = merged["cell_id"].astype(str).str.lower()
    return merged


meta1 = _assemble_meta(sig1, pert1, cell1, "GSE92742")
meta2 = _assemble_meta(sig2, pert2, cell2, "GSE70138")

# Stack the two series row-wise with a fresh 0..n-1 index.
meta = pd.concat([meta1, meta2], axis=0, ignore_index=True)
meta.shape, meta.head(3), meta.columns


((591697, 22),
                 sig_id pert_id pert_iname_x    pert_type cell_id pert_dose  \
 0  AML001_CD34_24H:A05    DMSO         DMSO  ctl_vehicle    cd34       0.1   
 1  AML001_CD34_24H:A06    DMSO         DMSO  ctl_vehicle    cd34       0.1   
 2  AML001_CD34_24H:B05    DMSO         DMSO  ctl_vehicle    cd34       0.1   
 
   pert_dose_unit pert_idose  pert_time pert_time_unit  ... pert_iname_y  \
 0              %      0.1 %       24.0              h  ...         DMSO   
 1              %      0.1 %       24.0              h  ...         DMSO   
 2              %      0.1 %       24.0              h  ...         DMSO   
 
                      inchi_key canonical_smiles cell_type base_cell_id  \
 0  IAZDPXIOMUYVGZ-UHFFFAOYSA-N          CS(=O)C   primary         CD34   
 1  IAZDPXIOMUYVGZ-UHFFFAOYSA-N          CS(=O)C   primary         CD34   
 2  IAZDPXIOMUYVGZ-UHFFFAOYSA-N          CS(=O)C   primary         CD34   
 
   modification primary_site      subtype     phase   smile

In [13]:
# Reconcile duplicated/conflicting columns and derive a tidy metadata table
# (dose and time stay as raw values here; numeric parsing happens later).

# 1) One compound-name column: start from `pert_iname` when it exists, then fill
#    its gaps from the merge-suffixed variants, `_x` first and `_y` second.
if "pert_iname" not in meta.columns:
    meta["pert_iname"] = pd.Series(pd.NA, index=meta.index)
for suffixed in ("pert_iname_x", "pert_iname_y"):
    if suffixed in meta.columns:
        meta["pert_iname"] = meta["pert_iname"].fillna(meta[suffixed])

# 2) Core and auxiliary columns to carry forward; a column is kept only if present.
wanted_cols = (
    # keys and chemical structure
    ["sig_id","pert_id","pert_iname","smiles","inchi_key"]
    # cell context
    + ["cell_id","cell_name","cell_type","base_cell_id","modification","primary_site","subtype"]
    # treatment conditions (raw values, converted to numbers in a later step)
    + ["pert_type","pert_dose","pert_dose_unit","pert_idose","pert_time","pert_time_unit","pert_itime"]
    # provenance
    + ["phase"]
)
keep_cols = [name for name in wanted_cols if name in meta.columns]

# 3) Keep the first row for each sig_id and renumber the index.
meta_tidy = (
    meta[keep_cols]
    .copy()
    .drop_duplicates(subset=["sig_id"])
    .reset_index(drop=True)
)

meta_tidy.shape, meta_tidy.head(3), meta_tidy.columns


((591697, 19),
                 sig_id pert_id pert_iname   smiles  \
 0  AML001_CD34_24H:A05    DMSO       DMSO  CS(=O)C   
 1  AML001_CD34_24H:A06    DMSO       DMSO  CS(=O)C   
 2  AML001_CD34_24H:B05    DMSO       DMSO  CS(=O)C   
 
                      inchi_key cell_id cell_type base_cell_id modification  \
 0  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 1  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 2  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 
   primary_site      subtype    pert_type pert_dose pert_dose_unit pert_idose  \
 0         bone  bone marrow  ctl_vehicle       0.1              %      0.1 %   
 1         bone  bone marrow  ctl_vehicle       0.1              %      0.1 %   
 2         bone  bone marrow  ctl_vehicle       0.1              %      0.1 %   
 
    pert_time pert_time_unit pert_itime     phase  
 0       24.0              h       24 h  GSE92742  
 1       24.0  

In [9]:
# Display meta_tidy as the cell output.
# NOTE(review): the NameError recorded for this cell is likely a leftover of
# out-of-order execution -- its execution count (9) is lower than that of the
# cell that creates meta_tidy (13).
meta_tidy

NameError: name 'meta_tidy' is not defined

In [14]:
# Derive normalised dose/time columns from the raw annotation columns using
# vectorised string operations: dose_value / dose_unit_raw / dose_uM / time_h.

import numpy as np

# First signed decimal number (optional exponent) found in a string.
_NUMBER_PATTERN = r"([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)"
_DOSE_UNIT_PATTERN = r"(uM|µM|μM|um|UM|nM|mM|%|mg/mL|ug/mL)"
_TIME_UNIT_PATTERN = r"(hour|hr|h|day|d)"


def _value_unit_text(frame, value_col, unit_col, fallback_col):
    """Build one "<value> <unit>" string per row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to read from; it is not modified.
    value_col, unit_col : str
        Columns joined with a single space after conversion to ``str``.
        Either may be absent, in which case it is simply left out.
    fallback_col : str
        Column whose string form replaces the joined text on rows where every
        available value/unit column is missing. Ignored when absent.

    Returns
    -------
    pandas.Series
        Text aligned to ``frame.index``.
    """
    present = [name for name in (value_col, unit_col) if name in frame.columns]
    if present:
        pieces = [frame[name].astype(str) for name in present]
        text = pieces[0]
        for piece in pieces[1:]:
            # Element-wise concatenation; same result as a row-wise " ".join.
            text = text + " " + piece
        all_missing = frame[present].isna().all(axis=1)
    else:
        text = pd.Series("", index=frame.index, dtype=object)
        all_missing = pd.Series(True, index=frame.index)
    if fallback_col in frame.columns:
        text.loc[all_missing] = frame.loc[all_missing, fallback_col].astype(str)
    return text


# ---- Dose: prefer (pert_dose, pert_dose_unit), otherwise fall back to pert_idose ----
dose_src = _value_unit_text(meta_tidy, "pert_dose", "pert_dose_unit", "pert_idose")

dose_num = pd.to_numeric(
    dose_src.str.extract(_NUMBER_PATTERN, expand=False),
    errors="coerce"
)
dose_unit_raw = dose_src.str.extract(_DOSE_UNIT_PATTERN, expand=False)
# Lower-case and map both mu code points (Greek mu, micro sign) to ASCII "u".
dose_unit_norm = (
    dose_unit_raw.str.lower()
                 .str.replace("\u03bc", "u", regex=False)
                 .str.replace("\u00b5", "u", regex=False)
)

# Convert molar doses to micromolar; every other unit (e.g. %) stays NaN.
is_micromolar = dose_unit_norm.isin(["um","umol","um/l"])
is_nanomolar = dose_unit_norm == "nm"
is_millimolar = dose_unit_norm == "mm"

dose_uM = pd.Series(np.nan, index=meta_tidy.index, dtype="float64")
dose_uM.loc[is_micromolar] = dose_num.loc[is_micromolar]
dose_uM.loc[is_nanomolar] = dose_num.loc[is_nanomolar] / 1000.0
dose_uM.loc[is_millimolar] = dose_num.loc[is_millimolar] * 1000.0

meta_tidy["dose_value"]    = dose_num
meta_tidy["dose_unit_raw"] = dose_unit_raw
meta_tidy["dose_uM"]       = dose_uM

# ---- Time: prefer (pert_time, pert_time_unit), otherwise fall back to pert_itime ----
time_src = _value_unit_text(meta_tidy, "pert_time", "pert_time_unit", "pert_itime")

time_num = pd.to_numeric(
    time_src.str.extract(_NUMBER_PATTERN, expand=False),
    errors="coerce"
)
time_unit = time_src.str.extract(_TIME_UNIT_PATTERN, expand=False).str.lower()

# Hours by default (including rows with no recognised unit); days are multiplied by 24.
time_h = time_num.where(~time_unit.isin(["d","day"]), time_num * 24.0)

meta_tidy["time_h"] = time_h

meta_tidy.shape, meta_tidy.head(3), meta_tidy.columns


((591697, 23),
                 sig_id pert_id pert_iname   smiles  \
 0  AML001_CD34_24H:A05    DMSO       DMSO  CS(=O)C   
 1  AML001_CD34_24H:A06    DMSO       DMSO  CS(=O)C   
 2  AML001_CD34_24H:B05    DMSO       DMSO  CS(=O)C   
 
                      inchi_key cell_id cell_type base_cell_id modification  \
 0  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 1  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 2  IAZDPXIOMUYVGZ-UHFFFAOYSA-N    cd34   primary         CD34         -666   
 
   primary_site  ... pert_dose_unit pert_idose pert_time pert_time_unit  \
 0         bone  ...              %      0.1 %      24.0              h   
 1         bone  ...              %      0.1 %      24.0              h   
 2         bone  ...              %      0.1 %      24.0              h   
 
   pert_itime     phase dose_value dose_unit_raw dose_uM  time_h  
 0       24 h  GSE92742        0.1             %     NaN    24.0  
 1       

In [25]:
# Add the boolean flags is_small_molecule / is_control and normalise two text columns.

# Guarantee a `pert_type` column so both flags can always be computed.
if "pert_type" not in meta_tidy.columns:
    meta_tidy["pert_type"] = pd.Series([pd.NA]*len(meta_tidy))
pert_type_lower = meta_tidy["pert_type"].astype(str).str.lower()

# Small molecule: perturbation type equals "trt_cp" (case-insensitive).
meta_tidy["is_small_molecule"] = pert_type_lower == "trt_cp"

# Control: a typical vehicle/untreated perturbagen name, or a type starting with "ctl".
if "pert_iname" in meta_tidy.columns:
    pert_name_source = meta_tidy["pert_iname"]
else:
    pert_name_source = pd.Series([""]*len(meta_tidy))
pert_name_upper = pert_name_source.astype(str).str.upper()
named_like_control = pert_name_upper.isin(["DMSO","CTL_VEHICLE","CTL_UNTRT"])
meta_tidy["is_control"] = named_like_control | pert_type_lower.str.startswith("ctl")

# Clean-up: cell_id becomes a lower-case string; smiles is cast to object dtype
# (no placeholder values are written here).
text_cleanups = {
    "cell_id": lambda column: column.astype(str).str.lower(),
    "smiles": lambda column: column.astype(object),
}
for column_name, cleanup in text_cleanups.items():
    if column_name in meta_tidy.columns:
        meta_tidy[column_name] = cleanup(meta_tidy[column_name])

# Show how both flags are distributed.
tuple(
    meta_tidy[flag].value_counts(dropna=False)
    for flag in ("is_small_molecule", "is_control")
)


(is_small_molecule
 True     312438
 False    279259
 Name: count, dtype: int64,
 is_control
 False    562868
 True      28829
 Name: count, dtype: int64)

In [50]:
# Selection criteria for the exported small-molecule signatures.
# Applied below:
#   1. is_small_molecule == True          (pert_type == "trt_cp")
#   5. 23.5 <= time_h <= 24.5
#   6. dose_uM is present and > 0
#   3. smiles is present and != "-666"
# NOTE(review): two further criteria were listed with these but are NOT applied
# by this cell -- confirm whether they should be:
#   2. is_control == False                (pert_type not starting with "ctl_")
#   4. inchi_key is present and != "-666"

TIME_H_MIN, TIME_H_MAX = 23.5, 24.5   # inclusive exposure-time window, in hours
MISSING_SENTINEL = "-666"             # string the cell treats as "no value"

selected = meta_tidy[meta_tidy["is_small_molecule"]]

# time_h is numeric, so a missing value can only show up as NaN (comparing it
# with the string sentinel would never match).
selected = selected[selected["time_h"].notna()]
print(selected["time_h"].value_counts())

# A tolerance window instead of exact float equality with 24.
selected = selected[selected["time_h"].between(TIME_H_MIN, TIME_H_MAX)]
print(selected["time_h"].value_counts())

# NaN > 0 evaluates to False, so this also drops rows without a micromolar dose.
selected = selected[selected["dose_uM"] > 0]
print(selected["dose_uM"].value_counts())

has_smiles = selected["smiles"].notna() & (selected["smiles"] != MISSING_SENTINEL)
# Explicit copy: later cells assign columns on this frame, which should not be
# a view-like slice of meta_tidy.
meta_tidy_small_molecule = selected[has_smiles].copy()
print(meta_tidy_small_molecule["smiles"].value_counts())


time_h
24.0     211081
6.0       96348
3.0        4950
48.0         58
144.0         1
Name: count, dtype: int64
time_h
24.0    211081
Name: count, dtype: int64
dose_uM
10.0000    63013
1.1100     17201
0.3700     17133
3.3300     17123
0.0400     16933
           ...  
9.5131         1
3.1710         1
1.0570         1
0.3523         1
3.3507         1
Name: count, Length: 2741, dtype: int64
smiles
CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(C)C)NC(=O)OCc1ccccc1)C=O               2348
CC(C)C[C@@H](NC(=O)[C@@H](Cc1ccccc1)NC(=O)C1=CNC=CN1)B(O)O                           2315
ONC(=O)CCCCCCC(=O)Nc1ccccc1                                                           963
COC1CC(C)CC2=C(OC)C(=O)C=C(NC(=O)\C(C)=C\C=C\C(OC)C(OC(N)=O)\C(C)=C\C(C)C1O)C2=O      709
OC[C@@H](O)CONC(=O)c1ccc(F)c(F)c1Nc1ccc(I)cc1F                                        611
                                                                                     ... 
CCC(=O)N1[C@H]([C@H](CO)[C@H]2Cn3c(ccc(C4=CCCCC4)c3=O)[C@

In [53]:
# Columns expected to hold numbers; values that cannot be parsed become NaN.
numeric_cols = ["pert_dose", "dose_value", "dose_uM", "time_h"]

available_numeric = [name for name in numeric_cols if name in meta_tidy_small_molecule.columns]
for col in available_numeric:
    as_number = pd.to_numeric(meta_tidy_small_molecule[col], errors="coerce")
    meta_tidy_small_molecule[col] = as_number


In [54]:
# Write the filtered metadata twice, as Parquet and as CSV, without the row index.
parquet_path = OUTPUT_DIR / "l1000_signatures_metadata.parquet"
csv_path = OUTPUT_DIR / "l1000_signatures_metadata.csv"
meta_tidy_small_molecule.to_parquet(parquet_path, index=False)
meta_tidy_small_molecule.to_csv(csv_path, index=False)

In [57]:
# Display the filtered small-molecule metadata table as the cell output.
meta_tidy_small_molecule

Unnamed: 0,sig_id,pert_id,pert_iname,smiles,inchi_key,cell_id,cell_type,base_cell_id,modification,primary_site,...,pert_time,pert_time_unit,pert_itime,phase,dose_value,dose_unit_raw,dose_uM,time_h,is_small_molecule,is_control
4,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24.0,h,24 h,GSE92742,0.37037,µM,0.37037,24.0,True,False
5,AML001_CD34_24H:BRD-A03772856:1.11111,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24.0,h,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False
6,AML001_CD34_24H:BRD-A03772856:10,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24.0,h,24 h,GSE92742,10.00000,µM,10.00000,24.0,True,False
7,AML001_CD34_24H:BRD-A03772856:3.33333,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24.0,h,24 h,GSE92742,3.33333,µM,3.33333,24.0,True,False
8,AML001_CD34_24H:BRD-A19037878:1.11111,BRD-A19037878,trichostatin-a,CC(\C=C(C)\C=C\C(=O)NO)C(=O)c1ccc(cc1)N(C)C,RTKIYFITIVXBLE-WKWSCTOISA-N,cd34,primary,CD34,-666,bone,...,24.0,h,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587513,REP.A028_YAPC_24H:K09,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,,,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False
587514,REP.A028_YAPC_24H:M18,BRD-K96862998,pirfenidone,Cc1ccc(=O)n(c1)-c1ccccc1,ISWRGOKTTBVCFA-UHFFFAOYSA-N,yapc,cell line,YAPC,-666,pancreas,...,,,24 h,GSE70138,0.04000,um,0.04000,24.0,True,False
587515,REP.A028_YAPC_24H:O01,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,,,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False
587516,REP.A028_YAPC_24H:O06,BRD-K60230970,MG-132,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,TZYWCYJVHRLUCT-VABKMULXSA-N,yapc,cell line,YAPC,-666,pancreas,...,,,24 h,GSE70138,20.00000,um,20.00000,24.0,True,False


In [56]:
print(meta_tidy_small_molecule.dtypes)

# For every object-typed column, print its first five distinct values to spot
# columns that should really be numeric.
object_columns = meta_tidy_small_molecule.select_dtypes(include="object")
for column_name, column_values in object_columns.items():
    print(column_name, column_values.unique()[:5])


sig_id                object
pert_id               object
pert_iname            object
smiles                object
inchi_key             object
cell_id               object
cell_type             object
base_cell_id          object
modification          object
primary_site          object
subtype               object
pert_type             object
pert_dose            float64
pert_dose_unit        object
pert_idose            object
pert_time            float64
pert_time_unit        object
pert_itime            object
phase                 object
dose_value           float64
dose_unit_raw         object
dose_uM              float64
time_h               float64
is_small_molecule       bool
is_control              bool
dtype: object
sig_id ['AML001_CD34_24H:BRD-A03772856:0.37037'
 'AML001_CD34_24H:BRD-A03772856:1.11111'
 'AML001_CD34_24H:BRD-A03772856:10'
 'AML001_CD34_24H:BRD-A03772856:3.33333'
 'AML001_CD34_24H:BRD-A19037878:1.11111']
pert_id ['BRD-A03772856' 'BRD-A19037878' 'BRD-A195002

In [58]:
from rdkit import rdBase
from rdkit import Chem
import unicodedata, re

print("RDKit:", rdBase.rdkitVersion)
print(meta_tidy_small_molecule["smiles"].dtype, len(meta_tidy_small_molecule))

# 1) Look at a few sample values.
print(meta_tidy_small_molecule["smiles"].head(10).tolist())

# 2) Count suspicious values.
# Missing entries must be detected on the raw column: after astype(str) a missing
# value is the text "nan", so isna() on the string view would always report 0.
smiles_raw = meta_tidy_small_molecule["smiles"]
is_missing = smiles_raw.isna()
# String view used by the checks below; missing entries become "".
s = smiles_raw.where(~is_missing, "").astype(str)
print("空/NA 计数:", is_missing.sum(), ((s == "") & ~is_missing).sum())
print("包含 -666:", (s.str.strip()=="-666").sum())

# 3) Character-set check: rows containing anything outside a rough set of
#    characters used in SMILES ('.' separators, aromatic/chirality symbols, ...).
#    Missing entries are "" here, fail the pattern, and are therefore flagged.
pat = re.compile(r"^[A-Za-z0-9@\+\-\[\]\(\)=#\\/%.:\*]+$")
mask_bad_charset = ~s.map(lambda x: bool(pat.match(x.strip())))
print("疑似含异常字符的行:", mask_bad_charset.sum())
print(s[mask_bad_charset].head(10).tolist())

RDKit: 2022.09.5
object 209824
['COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4', 'COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4', 'COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4', 'COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4', 'CC(\\C=C(C)\\C=C\\C(=O)NO)C(=O)c1ccc(cc1)N(C)C', 'CC(\\C=C(C)\\C=C\\C(=O)NO)C(=O)c1ccc(cc1)N(C)C', 'CC(\\C=C(C)\\C=C\\C(=O)NO)C(=O)c1ccc(cc1)N(C)C', 'COC1CC(C)CC2=C(OC)C(=O)C=C(NC(=O)\\C(C)=C\\C=C\\C(OC)C(OC(N)=O)\\C(C)=C\\C(C)C1O)C2=O', 'COC1CC(C)CC2=C(OC)C(=O)C=C(NC(=O)\\C(C)=C\\C=C\\C(OC)C(OC(N)=O)\\C(C)=C\\C(C)C1O)C2=O', 'COC1CC(C)CC2=C(OC)C(=O)C=C(NC(=O)\\C(C)=C\\C=C\\C(OC)C(OC(N)=O)\\C(C)=C\\C(C)C1O)C2=O']
空/NA 计数: 0 0
包含 -666: 0
疑似含异常字符的行: 0
[]


In [61]:
# Smallest and largest micromolar dose among the filtered signatures.
# (`counts` holds the dose_uM column itself, not a tally.)
counts = meta_tidy_small_molecule["dose_uM"]
lowest_dose, highest_dose = counts.min(), counts.max()
value_range = (lowest_dose, highest_dose)

In [62]:
# Display the (min, max) pair stored in value_range as the cell output.
value_range

(0.0003, 177.6)