In [1]:
import os
import h5py
import numpy as np
import pandas as pd

# ==== 路径 ====
BASE = "/ShangGaoAIProjects/Lingge/LINCS/data"
GCTX_PHASE1 = f"{BASE}/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"
GCTX_PHASE2 = f"{BASE}/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx"

# 把抽样预览/landmark列表存到你已有的 Processed_data 目录里
OUT_DIR = f"{BASE}/Processed_data"
os.makedirs(OUT_DIR, exist_ok=True)

def peek_gctx(gctx_path, head_rows=5, head_cols=5, sample_cols=2000, random_state=42):
    """
    只窥视 gctx：读基本形状、row/col ids、部分行列的切片、小规模随机抽样列的统计信息，
    并尝试读取 pr_is_lm 等 row meta 字段以统计 landmark 基因数量。
    """
    print("="*80)
    print(f"[OPEN] {gctx_path}")
    with h5py.File(gctx_path, "r") as f:
        # 标准层级：/0/DATA/0/matrix, /0/META/ROW, /0/META/COL
        mat = f["0"]["DATA"]["0"]["matrix"]
        shape = mat.shape  # (n_rows, n_cols) = (genes, signatures)
        print(f"Matrix shape: rows(genes)={shape[0]}, cols(signatures)={shape[1]}")

        # 读取行列 ID（字节 -> 字符串）
        row_ids = [rid.decode("utf-8") for rid in f["0"]["META"]["ROW"]["id"][:]]
        col_ids = [cid.decode("utf-8") for cid in f["0"]["META"]["COL"]["id"][:]]

        print(f"Row id sample (first 5): {row_ids[:5]}")
        print(f"Col id sample (first 5): {col_ids[:5]}")

        # 行元数据里常见的字段（不同版本可能略有不同）
        row_meta_grp = f["0"]["META"]["ROW"]
        def read_opt(name):
            return row_meta_grp[name][:] if name in row_meta_grp else None

        pr_is_lm = read_opt("pr_is_lm")  # Landmark 标记（0/1）
        pr_gene_id = read_opt("pr_gene_id")
        pr_gene_symbol = read_opt("pr_gene_symbol")

        # 处理 pr_is_lm
        n_lm = None
        if pr_is_lm is not None:
            # 可能是字节/字符串，统一转 int
            try:
                pr_is_lm = np.array([int(x) for x in pr_is_lm])
            except Exception:
                # 有些文件可能是字节 -> 先解码再转
                pr_is_lm = np.array([int(x.decode("utf-8")) for x in pr_is_lm])
            n_lm = int(pr_is_lm.sum())
            print(f"Detected landmark genes (pr_is_lm==1): {n_lm}")

        # 读取一个很小的 “左上角” 预览切片
        r_slice = slice(0, min(head_rows, shape[0]))
        c_slice = slice(0, min(head_cols, shape[1]))
        corner = mat[r_slice, c_slice]  # 这是一个 numpy 数组

        corner_df = pd.DataFrame(
            corner,
            index=row_ids[r_slice],
            columns=col_ids[c_slice]
        )
        print("\nTop-left preview:")
        display(corner_df)

        # 随机抽样一小部分列（例如 2000 列）做一些统计，避免全量读入内存
        rng = np.random.default_rng(random_state)
        n_cols = shape[1]
        n_take = min(sample_cols, n_cols)
        sample_col_idx = np.sort(rng.choice(n_cols, size=n_take, replace=False))
        sample_mat = mat[:, sample_col_idx]  # 仍在 HDF5 上切片，不会一下子爆内存

        # 计算每列的均值/方差（基因维度上）
        col_means = np.asarray(sample_mat.mean(axis=0)).ravel()
        col_vars = np.asarray(sample_mat.var(axis=0)).ravel()

        print(f"\nColumn stats on a sample of {n_take} signatures:")
        print(f"  mean of means = {col_means.mean():.4f}, std of means = {col_means.std():.4f}")
        print(f"  mean of variances = {col_vars.mean():.4f}, std of variances = {col_vars.std():.4f}")

        # 如果有 landmark 标记，导出 landmark 基因列表
        lm_path_csv, lm_path_parquet = None, None
        if n_lm is not None:
            lm_mask = pr_is_lm == 1
            # gene_symbol / gene_id 可能为空，尽量构建一个表
            df_lm = pd.DataFrame({
                "rid": row_ids,
                "is_landmark": lm_mask.astype(int),
                "pr_gene_symbol": [x.decode("utf-8") if isinstance(x, bytes) else (x if x is not None else None)
                                   for x in (pr_gene_symbol if pr_gene_symbol is not None else [None]*shape[0])],
                "pr_gene_id": [int(x) if (x is not None and not isinstance(x, (bytes, np.bytes_))) 
                               else (int(x.decode('utf-8')) if x is not None else None)
                               for x in (pr_gene_id if pr_gene_id is not None else [None]*shape[0])]
            })
            df_lm = df_lm[df_lm["is_landmark"] == 1].reset_index(drop=True)

            # 保存（带数据集名区分）
            tag = os.path.basename(gctx_path).split("_")[0]  # GSE92742 or GSE70138
            lm_path_csv = os.path.join(OUT_DIR, f"{tag}_landmark_genes.csv")
            lm_path_parquet = os.path.join(OUT_DIR, f"{tag}_landmark_genes.parquet")
            df_lm.to_csv(lm_path_csv, index=False)
            df_lm.to_parquet(lm_path_parquet, index=False)
            print(f"\nSaved landmark list: {lm_path_csv}")
            print(f"Saved landmark list: {lm_path_parquet}")

        # 也把预览角落保存下来，方便对照
        corner_path = os.path.join(OUT_DIR, f"{os.path.basename(gctx_path)}.corner_{head_rows}x_{head_cols}.csv")
        corner_df.to_csv(corner_path)
        print(f"Saved matrix corner preview: {corner_path}")

        # 返回一些可复用的信息
        return {
            "shape": shape,
            "row_ids": row_ids,
            "col_ids": col_ids,
            "landmark_count": n_lm,
            "corner_path": corner_path,
        }

# ============== 实际执行：分别窥视 Phase I & Phase II ==============
res1 = peek_gctx(GCTX_PHASE1, head_rows=5, head_cols=5, sample_cols=2000)
res2 = peek_gctx(GCTX_PHASE2, head_rows=5, head_cols=5, sample_cols=2000)

print("\n=== Summary ===")
for tag, res in [("GSE92742 (Phase I)", res1), ("GSE70138 (Phase II)", res2)]:
    shp = res["shape"]
    print(f"{tag}: genes={shp[0]}, signatures={shp[1]}, landmark_count={res['landmark_count']}")


[OPEN] /ShangGaoAIProjects/Lingge/LINCS/data/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx
Matrix shape: rows(genes)=473647, cols(signatures)=12328
Row id sample (first 5): ['5720', '466', '6009', '2309', '387']
Col id sample (first 5): ['CPC005_A375_6H:BRD-A85280935-003-01-7:10', 'CPC005_A375_6H:BRD-A07824748-001-02-6:10', 'CPC004_A375_6H:BRD-K20482099-001-01-1:10', 'CPC005_A375_6H:BRD-K62929068-001-03-3:10', 'CPC005_A375_6H:BRD-K43405658-001-01-8:10']

Top-left preview:


Unnamed: 0,CPC005_A375_6H:BRD-A85280935-003-01-7:10,CPC005_A375_6H:BRD-A07824748-001-02-6:10,CPC004_A375_6H:BRD-K20482099-001-01-1:10,CPC005_A375_6H:BRD-K62929068-001-03-3:10,CPC005_A375_6H:BRD-K43405658-001-01-8:10
5720,0.773769,-0.818468,0.189572,-0.146031,-0.654002
466,-0.645586,-0.810749,0.45906,-0.224676,-0.335681
6009,-5.449666,2.393775,1.27979,2.167868,2.333199
2309,0.193408,-0.582243,-0.178977,-1.182025,-1.012651
387,1.006298,0.455536,0.631738,-0.936414,-1.213203



Column stats on a sample of 2000 signatures:
  mean of means = 0.0218, std of means = 0.0704
  mean of variances = 0.6162, std of variances = 0.1894
Saved matrix corner preview: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx.corner_5x_5.csv
[OPEN] /ShangGaoAIProjects/Lingge/LINCS/data/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx
Matrix shape: rows(genes)=118050, cols(signatures)=12328
Row id sample (first 5): ['780', '7849', '2978', '2049', '2101']
Col id sample (first 5): ['REP.A001_A375_24H:A03', 'REP.A001_A375_24H:A04', 'REP.A001_A375_24H:A05', 'REP.A001_A375_24H:A06', 'REP.A001_A375_24H:A07']

Top-left preview:


Unnamed: 0,REP.A001_A375_24H:A03,REP.A001_A375_24H:A04,REP.A001_A375_24H:A05,REP.A001_A375_24H:A06,REP.A001_A375_24H:A07
780,4.264143,0.057249,-1.01248,0.308898,-0.10407
7849,-0.382211,0.304313,-0.674992,-0.335931,0.324702
2978,-0.571711,-0.754999,0.414515,-0.502323,0.495425
2049,0.584376,-0.589973,-0.227603,-1.775247,-0.107543
2101,0.658348,-0.226854,0.287899,-0.666601,-0.091924



Column stats on a sample of 2000 signatures:
  mean of means = 0.0413, std of means = 0.1096
  mean of variances = 0.8184, std of variances = 0.5090
Saved matrix corner preview: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx.corner_5x_5.csv

=== Summary ===
GSE92742 (Phase I): genes=473647, signatures=12328, landmark_count=None
GSE70138 (Phase II): genes=118050, signatures=12328, landmark_count=None


In [14]:
import os
import json
import h5py
import numpy as np
import pandas as pd

BASE = "/ShangGaoAIProjects/Lingge/LINCS/data"
META  = f"{BASE}/Processed_data"
OUT  = f"{BASE}/Processed_data/L1000gctx_process"
os.makedirs(OUT, exist_ok=True)

GCTX_P1 = f"{BASE}/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"   # Phase I
GCTX_P2 = f"{BASE}/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx"  # Phase II

# 你已经处理好的 meta
SIGMETA_ALL_PARQUET = f"{OUT}/l1000_signatures_metadata.parquet"
SIGMETA_CANON_PARQUET = f"{OUT}/l1000_signatures_metadata_canonical.parquet"


In [2]:
# 读取 Phase I 的行列 id 与矩阵维度
with h5py.File(GCTX_P1, "r") as f1:
    mat1 = f1["0"]["DATA"]["0"]["matrix"]
    shape1 = tuple(mat1.shape)
    row_ids1 = [x.decode("utf-8") if isinstance(x, (bytes, np.bytes_)) else str(x) for x in f1["0"]["META"]["ROW"]["id"][:]]
    col_ids1 = [x.decode("utf-8") if isinstance(x, (bytes, np.bytes_)) else str(x) for x in f1["0"]["META"]["COL"]["id"][:]]

# 读取 Phase II
with h5py.File(GCTX_P2, "r") as f2:
    mat2 = f2["0"]["DATA"]["0"]["matrix"]
    shape2 = tuple(mat2.shape)
    row_ids2 = [x.decode("utf-8") if isinstance(x, (bytes, np.bytes_)) else str(x) for x in f2["0"]["META"]["ROW"]["id"][:]]
    col_ids2 = [x.decode("utf-8") if isinstance(x, (bytes, np.bytes_)) else str(x) for x in f2["0"]["META"]["COL"]["id"][:]]

print("Phase I  shape (raw):", shape1)
print("Phase II shape (raw):", shape2)
print("\nPhase I  ROW first 3:", row_ids1[:3])
print("Phase I  COL first 3:", col_ids1[:3])
print("Phase II ROW first 3:", row_ids2[:3])
print("Phase II COL first 3:", col_ids2[:3])


Phase I  shape (raw): (473647, 12328)
Phase II shape (raw): (118050, 12328)

Phase I  ROW first 3: ['5720', '466', '6009']
Phase I  COL first 3: ['CPC005_A375_6H:BRD-A85280935-003-01-7:10', 'CPC005_A375_6H:BRD-A07824748-001-02-6:10', 'CPC004_A375_6H:BRD-K20482099-001-01-1:10']
Phase II ROW first 3: ['780', '7849', '2978']
Phase II COL first 3: ['REP.A001_A375_24H:A03', 'REP.A001_A375_24H:A04', 'REP.A001_A375_24H:A05']


In [7]:
# Phase I
gene_ids1 = row_ids1                 # 行 = 基因
sig_ids1  = col_ids1                 # 列 = 签名
print("Phase I: genes =", len(gene_ids1), "| signatures =", len(sig_ids1))

# Phase II
gene_ids2 = row_ids2
sig_ids2  = col_ids2
print("Phase II: genes =", len(gene_ids2), "| signatures =", len(sig_ids2))

# 简单 sanity check：col 数应为 12328（文件名指示），行数应为 ~10^5（签名数）
print("Sanity check: shape1 =", shape1, "| shape2 =", shape2)


Phase I: genes = 12328 | signatures = 473647
Phase II: genes = 12328 | signatures = 118050
Sanity check: shape1 = (473647, 12328) | shape2 = (118050, 12328)


In [9]:
import numpy as np
import pandas as pd
import os

# Phase I 索引
gidx1 = pd.DataFrame({
    "gene_id": gene_ids1,
    "pos": np.arange(len(gene_ids1), dtype=np.int32),
    "dataset": "GSE92742"
})
sidx1 = pd.DataFrame({
    "sig_id": sig_ids1,
    "pos": np.arange(len(sig_ids1), dtype=np.int32),
    "dataset": "GSE92742"
})

# Phase II 索引
gidx2 = pd.DataFrame({
    "gene_id": gene_ids2,
    "pos": np.arange(len(gene_ids2), dtype=np.int32),
    "dataset": "GSE70138"
})
sidx2 = pd.DataFrame({
    "sig_id": sig_ids2,
    "pos": np.arange(len(sig_ids2), dtype=np.int32),
    "dataset": "GSE70138"
})

gidx1_path = f"{OUT}/GSE92742_gene_index.parquet"
gidx2_path = f"{OUT}/GSE70138_gene_index.parquet"
sidx1_path = f"{OUT}/GSE92742_sig_index.parquet"
sidx2_path = f"{OUT}/GSE70138_sig_index.parquet"

gidx1.to_parquet(gidx1_path, index=False)
gidx2.to_parquet(gidx2_path, index=False)
sidx1.to_parquet(sidx1_path, index=False)
sidx2.to_parquet(sidx2_path, index=False)

print("Saved index files:")
print(" ", gidx1_path)
print(" ", gidx2_path)
print(" ", sidx1_path)
print(" ", sidx2_path)


Saved index files:
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/GSE92742_gene_index.parquet
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/GSE70138_gene_index.parquet
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/GSE92742_sig_index.parquet
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/GSE70138_sig_index.parquet


In [12]:
# 并集表：记录两期各自的行位置
gene_union = gidx1.merge(
    gidx2, on="gene_id", how="outer", suffixes=("_92742", "_70138"), indicator=True
)
u_path = f"{OUT}/gene_index_union.parquet"
gene_union.to_parquet(u_path, index=False)

# 统计集合关系
n1 = len(gidx1)
n2 = len(gidx2)
n_inter = (gene_union["_merge"] == "both").sum()
n_only_1 = (gene_union["_merge"] == "left_only").sum()
n_only_2 = (gene_union["_merge"] == "right_only").sum()

# 在交集上比较顺序（Spearman，用 rank+Pearson 实现，不依赖 scipy）
common = gene_union[gene_union["_merge"] == "both"].dropna(subset=["pos_92742","pos_70138"]).copy()
if len(common) >= 2:
    rho = common["pos_92742"].rank().corr(common["pos_70138"].rank(), method="pearson")
else:
    rho = np.nan

print("Gene set consistency:")
print(f"  Phase I genes: {n1}")
print(f"  Phase II genes: {n2}")
print(f"  Intersection : {n_inter}")
print(f"  Only Phase I : {n_only_1}")
print(f"  Only Phase II: {n_only_2}")
print(f"  Spearman rho on common gene order: {rho:.6f}" if pd.notna(rho) else "  Spearman rho: NA")
print("Union index saved to:", u_path)


Gene set consistency:
  Phase I genes: 12328
  Phase II genes: 12328
  Intersection : 12328
  Only Phase I : 0
  Only Phase II: 0
  Spearman rho on common gene order: 0.614592
Union index saved to: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/gene_index_union.parquet


In [16]:
# 读取你已经处理好的签名 meta（优先 parquet，如不存在再尝试 csv）
sigmeta_all_path    = f"{META}/l1000_signatures_metadata.parquet"
sigmeta_canon_path  = f"{META}/l1000_signatures_metadata_canonical.parquet"

if os.path.exists(sigmeta_all_path):
    try:
        sigmeta_all = pd.read_parquet(sigmeta_all_path, columns=["sig_id"])
    except Exception:
        sigmeta_all = None
else:
    csv_guess = os.path.splitext(sigmeta_all_path)[0] + ".csv"
    sigmeta_all = pd.read_csv(csv_guess, usecols=["sig_id"]) if os.path.exists(csv_guess) else None

if os.path.exists(sigmeta_canon_path):
    try:
        sigmeta_canon = pd.read_parquet(sigmeta_canon_path, columns=["sig_id"])
    except Exception:
        sigmeta_canon = None
else:
    csv_guess = os.path.splitext(sigmeta_canon_path)[0] + ".csv"
    sigmeta_canon = pd.read_csv(csv_guess, usecols=["sig_id"]) if os.path.exists(csv_guess) else None

def calc_join_rate(sig_index_df, sigmeta_df):
    if sigmeta_df is None:
        return None
    merged = sig_index_df.merge(sigmeta_df, on="sig_id", how="left", indicator=True)
    return float((merged["_merge"] == "both").mean())

rates = {
    "GSE92742": {
        "join_rate_all": calc_join_rate(sidx1, sigmeta_all),
        "join_rate_canonical": calc_join_rate(sidx1, sigmeta_canon)
    },
    "GSE70138": {
        "join_rate_all": calc_join_rate(sidx2, sigmeta_all),
        "join_rate_canonical": calc_join_rate(sidx2, sigmeta_canon)
    }
}

print("Signature–meta alignment rates:")
for ds, v in rates.items():
    print(f" {ds}: all={v['join_rate_all']}, canonical={v['join_rate_canonical']}")


Signature–meta alignment rates:
 GSE92742: all=0.2290714392786189, canonical=0.22882653115083595
 GSE70138: all=0.8583227445997459, canonical=0.8568996188055908


In [17]:
report = {
    "phase": {
        "GSE92742": {
            "gctx": GCTX_P1,
            "matrix_shape_raw": shape1,
            "n_genes": len(gene_ids1),
            "n_signatures": len(sig_ids1),
            "gene_index_path": f"{OUT}/GSE92742_gene_index.parquet",
            "sig_index_path":  f"{OUT}/GSE92742_sig_index.parquet",
        },
        "GSE70138": {
            "gctx": GCTX_P2,
            "matrix_shape_raw": shape2,
            "n_genes": len(gene_ids2),
            "n_signatures": len(sig_ids2),
            "gene_index_path": f"{OUT}/GSE70138_gene_index.parquet",
            "sig_index_path":  f"{OUT}/GSE70138_sig_index.parquet",
        }
    },
    "gene_set_consistency": {
        "n_genes_phase1": int(n1),
        "n_genes_phase2": int(n2),
        "n_intersection": int(n_inter),
        "n_only_phase1": int(n_only_1),
        "n_only_phase2": int(n_only_2),
        "spearman_rho_common_order": None if pd.isna(rho) else float(rho),
        "union_index_path": f"{OUT}/gene_index_union.parquet"
    },
    "signature_meta_alignment": rates,
    "notes": [
        "已固定：row=genes, col=signatures；不再使用启发式判断。",
        "此处仅完成索引体检；未抽取标签（978/12328）或连到化合物/细胞/时间/剂量 meta。",
        "如需将 Phase II 重排至 Phase I 顺序，可用 gene_index_union.parquet 的 pos_92742/pos_70138 构建映射。"
    ]
}

report_path = f"{OUT}/consistency_report.json"
with open(report_path, "w") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print("Report saved to:", report_path)


Report saved to: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/consistency_report.json


In [18]:
import os
import pandas as pd

BASE = "/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data"

# 优先用 canonical（如果你是“最终筛过的签名”，应该就是这份）
meta_path_candidates = [
    f"{BASE}/l1000_signatures_metadata_canonical.parquet",
    f"{BASE}/l1000_signatures_metadata.parquet",
    f"{BASE}/l1000_signatures_metadata.csv",
]

meta_path = None
for p in meta_path_candidates:
    if os.path.exists(p):
        meta_path = p
        break

print("Using meta:", meta_path)

# 读取（自动识别 parquet/csv）
if meta_path.endswith(".parquet"):
    meta = pd.read_parquet(meta_path)
elif meta_path.endswith(".csv"):
    meta = pd.read_csv(meta_path)
else:
    raise FileNotFoundError("找不到 meta 文件，请检查上述路径。")

print("meta shape:", meta.shape)
print("meta columns:", list(meta.columns))
display(meta.head(3))


Using meta: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/l1000_signatures_metadata_canonical.parquet
meta shape: (209540, 27)
meta columns: ['sig_id', 'pert_id', 'pert_iname', 'smiles', 'inchi_key', 'cell_id', 'cell_type', 'base_cell_id', 'modification', 'primary_site', 'subtype', 'pert_type', 'pert_dose', 'pert_dose_unit', 'pert_idose', 'pert_time', 'pert_time_unit', 'pert_itime', 'phase', 'dose_value', 'dose_unit_raw', 'dose_uM', 'time_h', 'is_small_molecule', 'is_control', 'smiles_canonical', 'compound_id']


Unnamed: 0,sig_id,pert_id,pert_iname,smiles,inchi_key,cell_id,cell_type,base_cell_id,modification,primary_site,...,pert_itime,phase,dose_value,dose_unit_raw,dose_uM,time_h,is_small_molecule,is_control,smiles_canonical,compound_id
0,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,0.37037,µM,0.37037,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
1,AML001_CD34_24H:BRD-A03772856:1.11111,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
2,AML001_CD34_24H:BRD-A03772856:10,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,10.0,µM,10.0,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N


In [20]:
META

'/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data'

In [21]:
import os, pandas as pd, numpy as np


# 两期 gctx 的签名索引（你之前已经导出过）
sidx1 = pd.read_parquet(f"{OUT}/GSE92742_sig_index.parquet")   # cols: sig_id, pos, dataset
sidx2 = pd.read_parquet(f"{OUT}/GSE70138_sig_index.parquet")

# 这次要用的 meta（你刚确认的是 canonical）
META_PATH = f"{META}/l1000_signatures_metadata_canonical.parquet"
meta = pd.read_parquet(META_PATH)

print("sidx1:", sidx1.shape, "sidx2:", sidx2.shape)
print("meta :", meta.shape)
meta.head(3)


sidx1: (473647, 3) sidx2: (118050, 3)
meta : (209540, 27)


Unnamed: 0,sig_id,pert_id,pert_iname,smiles,inchi_key,cell_id,cell_type,base_cell_id,modification,primary_site,...,pert_itime,phase,dose_value,dose_unit_raw,dose_uM,time_h,is_small_molecule,is_control,smiles_canonical,compound_id
0,AML001_CD34_24H:BRD-A03772856:0.37037,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,0.37037,µM,0.37037,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
1,AML001_CD34_24H:BRD-A03772856:1.11111,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,1.11111,µM,1.11111,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N
2,AML001_CD34_24H:BRD-A03772856:10,BRD-A03772856,BRD-A03772856,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,YSPMFQJSWDVJME-UHFFFAOYSA-N,cd34,primary,CD34,-666,bone,...,24 h,GSE92742,10.0,µM,10.0,24.0,True,False,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N


In [23]:
# 假设你已经在内存里有：
# sidx1, sidx2, meta

# 先看 phase 分布
print("Phase counts in meta:")
print(meta["phase"].value_counts(dropna=False))

# Phase I 连接
meta_p1 = meta[meta["phase"] == "GSE92742"].copy()
p1_join = meta_p1.merge(sidx1, on="sig_id", how="left", validate="m:1", indicator=True)
p1_hit = float((p1_join["_merge"] == "both").mean())
print(f"\nPhase I join rate (canonical): {p1_hit:.4f}  | rows={len(p1_join)}")
print("Missing examples (Phase I):")
display(p1_join[p1_join["_merge"] != "both"][["sig_id"]].head(10))

# Phase II 连接
meta_p2 = meta[meta["phase"] == "GSE70138"].copy()
p2_join = meta_p2.merge(sidx2, on="sig_id", how="left", validate="m:1", indicator=True)
p2_hit = float((p2_join["_merge"] == "both").mean())
print(f"\nPhase II join rate (canonical): {p2_hit:.4f} | rows={len(p2_join)}")
print("Missing examples (Phase II):")
display(p2_join[p2_join["_merge"] != "both"][["sig_id"]].head(10))


Phase counts in meta:
phase
GSE92742    108383
GSE70138    101157
Name: count, dtype: int64

Phase I join rate (canonical): 1.0000  | rows=108383
Missing examples (Phase I):


Unnamed: 0,sig_id



Phase II join rate (canonical): 1.0000 | rows=101157
Missing examples (Phase II):


Unnamed: 0,sig_id


In [24]:
keep_cols_meta = [
    "sig_id", "phase", "pert_id", "pert_iname",
    "cell_id", "time_h",
    "dose_uM", "dose_value", "dose_unit_raw",
    "smiles", "smiles_canonical", "inchi_key", "compound_id",
    "is_small_molecule", "is_control", "pert_type"
]

# Phase I 命中
p1_ok = p1_join[p1_join["_merge"] == "both"].copy()
p1_ok = p1_ok.rename(columns={"pos": "col_pos"})
p1_ok["dataset"] = "GSE92742"
p1_ok = p1_ok[keep_cols_meta + ["col_pos", "dataset"]]

# Phase II 命中
p2_ok = p2_join[p2_join["_merge"] == "both"].copy()
p2_ok = p2_ok.rename(columns={"pos": "col_pos"})
p2_ok["dataset"] = "GSE70138"
p2_ok = p2_ok[keep_cols_meta + ["col_pos", "dataset"]]

# 合并
train_catalog = pd.concat([p1_ok, p2_ok], ignore_index=True)
train_catalog_path = f"{OUT}/train_catalog_canonical.parquet"
train_catalog.to_parquet(train_catalog_path, index=False)

print("Saved:", train_catalog_path, "| shape:", train_catalog.shape)
display(train_catalog.head(3))


Saved: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/train_catalog_canonical.parquet | shape: (209540, 18)


Unnamed: 0,sig_id,phase,pert_id,pert_iname,cell_id,time_h,dose_uM,dose_value,dose_unit_raw,smiles,smiles_canonical,inchi_key,compound_id,is_small_molecule,is_control,pert_type,col_pos,dataset
0,AML001_CD34_24H:BRD-A03772856:0.37037,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,0.37037,0.37037,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434316,GSE92742
1,AML001_CD34_24H:BRD-A03772856:1.11111,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,1.11111,1.11111,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434315,GSE92742
2,AML001_CD34_24H:BRD-A03772856:10,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,10.0,10.0,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434313,GSE92742


In [25]:
train_catalog

Unnamed: 0,sig_id,phase,pert_id,pert_iname,cell_id,time_h,dose_uM,dose_value,dose_unit_raw,smiles,smiles_canonical,inchi_key,compound_id,is_small_molecule,is_control,pert_type,col_pos,dataset
0,AML001_CD34_24H:BRD-A03772856:0.37037,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,0.37037,0.37037,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434316,GSE92742
1,AML001_CD34_24H:BRD-A03772856:1.11111,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,1.11111,1.11111,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434315,GSE92742
2,AML001_CD34_24H:BRD-A03772856:10,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,10.00000,10.00000,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434313,GSE92742
3,AML001_CD34_24H:BRD-A03772856:3.33333,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,3.33333,3.33333,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434314,GSE92742
4,AML001_CD34_24H:BRD-A19037878:1.11111,GSE92742,BRD-A19037878,trichostatin-a,cd34,24.0,1.11111,1.11111,µM,CC(\C=C(C)\C=C\C(=O)NO)C(=O)c1ccc(cc1)N(C)C,CC(C=CC(=O)NO)=CC(C)C(=O)c1ccc(N(C)C)cc1,RTKIYFITIVXBLE-WKWSCTOISA-N,RTKIYFITIVXBLE-WKWSCTOISA-N,True,False,trt_cp,434463,GSE92742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209535,REP.A028_YAPC_24H:K09,GSE70138,BRD-K60230970,MG-132,yapc,24.0,20.00000,20.00000,um,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N,TZYWCYJVHRLUCT-VABKMULXSA-N,True,False,trt_cp,73903,GSE70138
209536,REP.A028_YAPC_24H:M18,GSE70138,BRD-K96862998,pirfenidone,yapc,24.0,0.04000,0.04000,um,Cc1ccc(=O)n(c1)-c1ccccc1,Cc1ccc(=O)n(-c2ccccc2)c1,ISWRGOKTTBVCFA-UHFFFAOYSA-N,ISWRGOKTTBVCFA-UHFFFAOYSA-N,True,False,trt_cp,73904,GSE70138
209537,REP.A028_YAPC_24H:O01,GSE70138,BRD-K60230970,MG-132,yapc,24.0,20.00000,20.00000,um,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N,TZYWCYJVHRLUCT-VABKMULXSA-N,True,False,trt_cp,73905,GSE70138
209538,REP.A028_YAPC_24H:O06,GSE70138,BRD-K60230970,MG-132,yapc,24.0,20.00000,20.00000,um,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CC(...,CC(C)CC(C=O)NC(=O)C(CC(C)C)NC(=O)C(CC(C)C)NC(=...,TZYWCYJVHRLUCT-VABKMULXSA-N,TZYWCYJVHRLUCT-VABKMULXSA-N,True,False,trt_cp,73906,GSE70138


In [26]:
qc = {
    "n_rows": int(len(train_catalog)),
    "n_unique_sig": int(train_catalog["sig_id"].nunique()),
    "n_phase1": int((train_catalog["dataset"] == "GSE92742").sum()),
    "n_phase2": int((train_catalog["dataset"] == "GSE70138").sum()),
    "cells": int(train_catalog["cell_id"].nunique()),
    "smiles_non_null_ratio": float(train_catalog["smiles_canonical"].notna().mean()),
    "trt_cp_ratio": float((train_catalog["pert_type"] == "trt_cp").mean()) if "pert_type" in train_catalog else None,
    "no_controls": bool((train_catalog["is_control"] == False).all()) if "is_control" in train_catalog else None,
}
print(qc)

# 看看时间和剂量的基本分布
print("\nTime (h) describe:")
display(train_catalog["time_h"].describe())
print("Dose (uM) describe:")
display(train_catalog["dose_uM"].describe())


{'n_rows': 209540, 'n_unique_sig': 209540, 'n_phase1': 108383, 'n_phase2': 101157, 'cells': 42, 'smiles_non_null_ratio': 1.0, 'trt_cp_ratio': 1.0, 'no_controls': True}

Time (h) describe:


count    209540.0
mean         24.0
std           0.0
min          24.0
25%          24.0
50%          24.0
75%          24.0
max          24.0
Name: time_h, dtype: float64

Dose (uM) describe:


count    209540.000000
mean          5.516054
std           7.312951
min           0.000300
25%           0.370000
50%           5.000000
75%          10.000000
max         177.600000
Name: dose_uM, dtype: float64

In [27]:
# === 路径 ===
import os, pandas as pd, numpy as np

BASE = "/ShangGaoAIProjects/Lingge/LINCS/data"
OUT_DIR = f"{BASE}/Processed_data/L1000gctx_process"

GCTX_PHASE1 = f"{BASE}/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"
GCTX_PHASE2 = f"{BASE}/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx"

# Phase I 的 gene_info（包含 pr_is_lm 标记）
GENE_INFO_P1 = f"{BASE}/GSE92742/GSE92742_Broad_LINCS_gene_info.txt.gz"

# Phase I 的基因索引（行顺序）
GIDX_P1_PATH = f"{OUT_DIR}/GSE92742_gene_index.parquet"

# === 读取与对齐 ===
gene_info_p1 = pd.read_csv(GENE_INFO_P1, sep="\t")
gidx1 = pd.read_parquet(GIDX_P1_PATH)   # columns: gene_id, pos, dataset

# 将 ID 类型统一为字符串，避免类型不匹配
gidx1["gene_id_str"] = gidx1["gene_id"].astype(str)
if "pr_gene_id" in gene_info_p1.columns:
    gene_info_p1["gene_id_str"] = gene_info_p1["pr_gene_id"].astype(str)
elif "gene_id" in gene_info_p1.columns:
    gene_info_p1["gene_id_str"] = gene_info_p1["gene_id"].astype(str)
else:
    raise ValueError("gene_info 缺少 pr_gene_id / gene_id 列，无法定位 Landmark。")

# === 取 Landmark 列表（pr_is_lm == 1） ===
assert "pr_is_lm" in gene_info_p1.columns, "gene_info 缺少 pr_is_lm 列。"
lm_ids = (
    gene_info_p1.loc[gene_info_p1["pr_is_lm"] == 1, "gene_id_str"]
    .drop_duplicates()
    .tolist()
)
print("Landmark gene count (expect ~978):", len(lm_ids))

# === 映射到 Phase I 行位置，并按 Phase I 顺序升序排列 ===
lm_df = (
    pd.DataFrame({"gene_id_str": lm_ids})
    .merge(gidx1[["gene_id_str", "pos"]], on="gene_id_str", how="inner")
    .sort_values("pos")
)
landmark_idx_p1 = lm_df["pos"].to_numpy(dtype=np.int32)
print("Landmark idx length:", len(landmark_idx_p1))
print("First 10 row positions:", landmark_idx_p1[:10])

# === 保存（后续抽取标签要用） ===
np.save(f"{OUT_DIR}/landmark_idx_phase1.npy", landmark_idx_p1)
lm_df[["gene_id_str"]].to_csv(f"{OUT_DIR}/landmark_gene_ids_phase1.txt", index=False, header=False)
print("Saved:")
print(" ", f"{OUT_DIR}/landmark_idx_phase1.npy")
print(" ", f"{OUT_DIR}/landmark_gene_ids_phase1.txt")


Landmark gene count (expect ~978): 978
Landmark idx length: 978
First 10 row positions: [0 1 2 3 4 5 6 7 8 9]
Saved:
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/landmark_idx_phase1.npy
  /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/landmark_gene_ids_phase1.txt


In [28]:
# === 路径 ===
import pandas as pd, numpy as np

OUT_DIR = "/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process"

UNION_PATH = f"{OUT_DIR}/gene_index_union.parquet"   # 之前导出的并集/交集索引
GIDX_P1_PATH = f"{OUT_DIR}/GSE92742_gene_index.parquet"

# === 读取并检查列 ===
gene_union = pd.read_parquet(UNION_PATH)
need_cols = {"gene_id", "pos_92742", "pos_70138"}
missing = need_cols - set(gene_union.columns)
assert not missing, f"gene_index_union.parquet 缺少列: {missing}"

# 读取 Phase I 基因索引，拿到基因总数（应为 12328）
gidx1 = pd.read_parquet(GIDX_P1_PATH)
n_genes_p1 = int(gidx1["pos"].max() + 1)
print("Phase I gene count (rows):", n_genes_p1)

# === 只保留两期共有基因，并按 Phase I 行序排序 ===
common = gene_union.dropna(subset=["pos_92742", "pos_70138"]).copy()
common["pos_92742"] = common["pos_92742"].astype(int)
common["pos_70138"] = common["pos_70138"].astype(int)
common = common.sort_values("pos_92742")

# === 构建 Phase II → Phase I 的行重排索引
# p2_row_order[p1_pos] = 对应 Phase II 的行号
p2_row_order = np.empty(n_genes_p1, dtype=np.int32)
p2_row_order.fill(-1)
p2_row_order[common["pos_92742"].to_numpy()] = common["pos_70138"].to_numpy()

# 如果两期基因集合完全一致，此处不应出现 -1
assert (p2_row_order >= 0).all(), "存在无法映射到 Phase II 的行号，请检查 gene_index_union。"

# === 保存重排索引 ===
np.save(f"{OUT_DIR}/phase2_row_reorder_index.npy", p2_row_order)
print("Saved:", f"{OUT_DIR}/phase2_row_reorder_index.npy")

# === 生成 Phase II 坐标系下的 Landmark 行号 ===
landmark_idx_p1 = np.load(f"{OUT_DIR}/landmark_idx_phase1.npy")          # P1 顺序下的 978 行
landmark_idx_p2 = p2_row_order[landmark_idx_p1]                           # 映射到 P2 的行号
np.save(f"{OUT_DIR}/landmark_idx_phase2_in_p2coords.npy", landmark_idx_p2)
print("Saved:", f"{OUT_DIR}/landmark_idx_phase2_in_p2coords.npy")
print("Phase II landmark idx length:", len(landmark_idx_p2))
print("First 10 (in P2 coords):", landmark_idx_p2[:10])


Phase I gene count (rows): 12328
Saved: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/phase2_row_reorder_index.npy
Saved: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/landmark_idx_phase2_in_p2coords.npy
Phase II landmark idx length: 978
First 10 (in P2 coords): [  204 12028   569  2281    55  2919  7585  7842   501  7597]


In [47]:
import os, pandas as pd

BASE    = "/ShangGaoAIProjects/Lingge/LINCS/data"
PROC    = f"{BASE}/Processed_data"
OUT_DIR = f"{PROC}/L1000gctx_process"

# 目录（按需二选一）
CATALOG_PATH = f"{OUT_DIR}/train_catalog_canonical.parquet"           # 全量
# CATALOG_PATH = f"{OUT_DIR}/train_catalog_canonical_24h.parquet"     # 如果你之后只想 24h 就改用这一行

CAT = pd.read_parquet(CATALOG_PATH)
print("Catalog:", CATALOG_PATH, "| shape:", CAT.shape)
display(CAT.head(3))


Catalog: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/train_catalog_canonical.parquet | shape: (209540, 18)


Unnamed: 0,sig_id,phase,pert_id,pert_iname,cell_id,time_h,dose_uM,dose_value,dose_unit_raw,smiles,smiles_canonical,inchi_key,compound_id,is_small_molecule,is_control,pert_type,col_pos,dataset
0,AML001_CD34_24H:BRD-A03772856:0.37037,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,0.37037,0.37037,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434316,GSE92742
1,AML001_CD34_24H:BRD-A03772856:1.11111,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,1.11111,1.11111,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434315,GSE92742
2,AML001_CD34_24H:BRD-A03772856:10,GSE92742,BRD-A03772856,BRD-A03772856,cd34,24.0,10.0,10.0,µM,COc1ccccc1C2N(C(=O)C3CCCN23)c4ccc(Cl)cc4,COc1ccccc1C1N(c2ccc(Cl)cc2)C(=O)C2CCCN21,YSPMFQJSWDVJME-UHFFFAOYSA-N,YSPMFQJSWDVJME-UHFFFAOYSA-N,True,False,trt_cp,434313,GSE92742


In [48]:
import h5py, numpy as np, pandas as pd, os

BASE    = "/ShangGaoAIProjects/Lingge/LINCS/data"
PROC    = f"{BASE}/Processed_data"
OUT_DIR = f"{PROC}/L1000gctx_process"

GCTX_P1 = f"{BASE}/GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx"
GCTX_P2 = f"{BASE}/GSE70138/GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328_2017_03_06.gctx"

CATALOG_PATH = f"{OUT_DIR}/train_catalog_canonical.parquet"  # 或换成 24h 版本
CAT = pd.read_parquet(CATALOG_PATH)

lm_idx_p1 = np.load(f"{OUT_DIR}/landmark_idx_phase1.npy")
lm_idx_p2 = np.load(f"{OUT_DIR}/landmark_idx_phase2_in_p2coords.npy")

N = len(CAT)
D = len(lm_idx_p1)  # 978
mm_name = f"Y_landmark_{os.path.splitext(os.path.basename(CATALOG_PATH))[0]}.mmap"
Y_path = f"{OUT_DIR}/{mm_name}"
Y = np.memmap(Y_path, mode="w+", dtype="float32", shape=(N, D))
print("Create memmap:", Y_path, "| shape:", Y.shape)

idx_p1 = np.where(CAT["dataset"].to_numpy() == "GSE92742")[0]
idx_p2 = np.where(CAT["dataset"].to_numpy() == "GSE70138")[0]
colpos_p1 = CAT.loc[idx_p1, "col_pos"].astype(int).to_numpy()
colpos_p2 = CAT.loc[idx_p2, "col_pos"].astype(int).to_numpy()

batch = 2048  # 如内存宽裕可调到 4096；不够就降到 1024

# —— 抽 Phase I —— #
if len(idx_p1) > 0:
    with h5py.File(GCTX_P1, "r") as f1:
        mat1 = f1["0"]["DATA"]["0"]["matrix"]   # (n_sigs, 12328)
        for start in range(0, len(colpos_p1), batch):
            end  = min(start + batch, len(colpos_p1))

            # 这批样本在 CAT 中的目标行 & 对应的签名 row 索引
            dest_rows  = idx_p1[start:end]              # 写回 Y 的行（保持与 CAT 对齐）
            sig_rows   = colpos_p1[start:end].astype(np.int64)

            # 关键修正：按 h5py 要求，行索引需递增
            sort_idx   = np.argsort(sig_rows)
            sig_sorted = sig_rows[sort_idx]

            # 先取 (B, 12328)，再选 978 列
            tmp_sorted = mat1[sig_sorted, :]                   # (B, 12328)
            blk_sorted = tmp_sorted[:, lm_idx_p1].astype(np.float32)  # (B, 978)

            # 还原到原 batch 顺序
            inv = np.argsort(sort_idx)
            block = blk_sorted[inv, :]                         # (B, 978)

            # 写回对应的全局行
            Y[dest_rows, :] = block

# —— 抽 Phase II —— #
if len(idx_p2) > 0:
    with h5py.File(GCTX_P2, "r") as f2:
        mat2 = f2["0"]["DATA"]["0"]["matrix"]
        for start in range(0, len(colpos_p2), batch):
            end  = min(start + batch, len(colpos_p2))

            dest_rows  = idx_p2[start:end]
            sig_rows   = colpos_p2[start:end].astype(np.int64)

            sort_idx   = np.argsort(sig_rows)
            sig_sorted = sig_rows[sort_idx]

            tmp_sorted = mat2[sig_sorted, :]                         # (B, 12328)
            blk_sorted = tmp_sorted[:, lm_idx_p2].astype(np.float32) # (B, 978)

            inv = np.argsort(sort_idx)
            block = blk_sorted[inv, :]

            Y[dest_rows, :] = block

Y.flush()
print("Finished writing memmap:", Y_path)



Create memmap: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/Y_landmark_train_catalog_canonical.mmap | shape: (209540, 978)
Finished writing memmap: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/Y_landmark_train_catalog_canonical.mmap


In [49]:
import numpy as np
import pandas as pd

OUT_DIR = "/ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process"

# 重新以只读方式 mmap
Y = np.memmap(f"{OUT_DIR}/{mm_name}", mode="r", dtype="float32", shape=(len(CAT), D))

print("Y shape:", Y.shape)
print("Y global stats:")
print("  mean:", float(Y.mean()))
print("  std :", float(Y.std()))
print("  min :", float(Y.min()))
print("  max :", float(Y.max()))

# 保存侧车索引（方便溯源与训练时 join）
sidecar_path = f"{OUT_DIR}/Y_landmark_{os.path.splitext(os.path.basename(CATALOG_PATH))[0]}.index.parquet"
CAT.to_parquet(sidecar_path, index=False)
print("Saved sidecar index:", sidecar_path)


Y shape: (209540, 978)
Y global stats:
  mean: 0.0024143001064658165
  std : 1.3626172542572021
  min : -10.000001907348633
  max : 10.000001907348633
Saved sidecar index: /ShangGaoAIProjects/Lingge/LINCS/data/Processed_data/L1000gctx_process/Y_landmark_train_catalog_canonical.index.parquet
