In [1]:
# %% [markdown]
# 📦 Shared output directory for all artifacts (Pancancer + Tahoe)

from pathlib import Path

# Single root folder for everything this notebook writes; edit the path
# components below to redirect the outputs elsewhere.
BASE_DIR = Path("..") / ".." / "data" / "processed" / "xfer_shared"

# Create the folder (and any missing parents); a no-op when it already exists.
BASE_DIR.mkdir(exist_ok=True, parents=True)

print("Saving all artifacts under:", BASE_DIR.resolve())


Saving all artifacts under: /home/guests3/gba/thesis_implementation/thesis_implementation/single_cell/pan_cancer/data/processed/xfer_shared


In [None]:
# %% [markdown]
# 🧬 Pancancer: PCA on single-cell expression → per-cell PCs → pseudo-bulk PCs (mean per group)

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, IncrementalPCA
import joblib
import scanpy as sc

# ===== CONFIG =====
# -- input files --
EXP_PATH = "../../data/processed/pancancer_sc_expression.parquet"   # expression table, expected cells × genes (wide)
META_PATH = "../../data/processed/pancancer_dimred.h5ad"            # AnnData file, read for per-cell metadata

# -- aggregation / PCA settings --
GROUP_KEY = "SIDM"           # obs column used to pool cells into pseudo-bulk groups
N_PCS = 30                   # number of principal components to keep
USE_INCREMENTAL = False      # set to True to fit with IncrementalPCA when memory is tight
BATCH_SIZE = 20_000          # rows per batch; only consulted when USE_INCREMENTAL is True

# ----- Load & align -----
print("📥 Loading expression + AnnData...")
X = pd.read_parquet(EXP_PATH)
adata = sc.read(META_PATH)

# Per-cell identifiers and group labels as plain NumPy arrays, both taken
# from the AnnData metadata.
cell_ids, groups = adata.obs_names.to_numpy(), adata.obs[GROUP_KEY].to_numpy()



📥 Loading expression + AnnData...


In [5]:
from collections import Counter

# Diagnostic cell: prints how the expression table `X` and the AnnData object
# relate to each other (shapes, dtypes, ID overlap, duplicates). It only reads
# `X` and `adata`; neither object is modified here.

obs_names = adata.obs_names.astype(str)

# Build the string-ID sets once; every overlap check below reuses them instead
# of re-hashing tens of thousands of labels per comparison.
obs_name_set = set(obs_names)
exp_index_ids = set(map(str, X.index))
exp_column_ids = set(map(str, X.columns))

# A zero-row slice keeps the column dtypes, so dtype-based column selection can
# be done on it and its cost no longer depends on the number of rows in X.
dtype_probe = X.iloc[:0]
n_numeric_cols = dtype_probe.select_dtypes(include=[np.number]).shape[1]
str_cols = dtype_probe.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"🧬 EXP raw shape: {X.shape} (rows × cols)")
print(f"🧬 AnnData cells: {adata.n_obs:,}")

# ---- Peek at EXP table ----
print("\n🧾 EXP index:")
print("  name:", X.index.name, "| dtype:", getattr(X.index, "dtype", None))
# Slicing an empty index yields an empty list, so no emptiness guard is needed.
print("  first 5 index values:", list(map(str, X.index[:5])))

print("\n🧾 EXP columns (first 10):")
print(list(X.columns[:10]))
print("  # numeric columns:", n_numeric_cols)
print("  # object columns :", len(str_cols))

# If a likely cell-id column exists, show a preview.
# Labels are passed through str() because column labels are not guaranteed to
# be strings (e.g. integer labels), and .lower() would raise on those.
id_column_hints = {
    "cell_id", "cellid", "barcode", "cell", "obs_names", "obsname",
    "cell_barcode", "cellbarcode",
}
likely_id_cols = [c for c in X.columns if str(c).lower() in id_column_hints]
if likely_id_cols:
    print("\n🔎 Found candidate ID columns in EXP:", likely_id_cols)
    for c in likely_id_cols:
        # Take the head first so only five values are converted to str.
        print(f"  {c}: first 5 ->", X[c].head().astype(str).tolist())
else:
    print("\nℹ️ No obvious ID column names detected in EXP (we'll probe string columns).")

# ---- AnnData previews ----
print("\n📇 AnnData obs_names (first 5):", obs_names[:5].tolist())
print("🧪 GROUP_KEY value counts (top 5):")
print(adata.obs[GROUP_KEY].value_counts().head())

# ---- Overlap diagnostics ----
# 1) overlap between EXP index and obs_names
index_matches = exp_index_ids & obs_name_set
index_overlap = len(index_matches)
print(f"\n🔗 Overlap: EXP.index ∩ obs_names = {index_overlap}")

# 2) overlap between EXP columns and obs_names (signals transposed matrix)
column_matches = exp_column_ids & obs_name_set
cols_overlap = len(column_matches)
print(f"🔗 Overlap: EXP.columns ∩ obs_names = {cols_overlap}")

# 3) try every non-numeric column as a candidate ID column
cand_scores = []
for c in str_cols:
    ov = len(set(X[c].astype(str)) & obs_name_set)
    if ov > 0:
        cand_scores.append((c, ov))
cand_scores.sort(key=lambda t: t[1], reverse=True)

if cand_scores:
    print("\n🏷️ Candidate ID columns ranked by overlap with obs_names:")
    for c, ov in cand_scores[:10]:
        # str(c): the "30s" format spec raises on non-string labels.
        print(f"  {str(c):30s} -> overlap {ov}")
else:
    print("\n⚠️ No string column in EXP overlaps with obs_names.")

# ---- Duplicate checks ----
# Counter preserves first-seen order, so the examples follow the table order.
if X.index.is_unique:
    print("\n✅ EXP index is unique")
else:
    dup_idx = [k for k, v in Counter(map(str, X.index)).items() if v > 1][:5]
    print("\n⚠️ EXP index has duplicates. Examples:", dup_idx)

if adata.obs_names.is_unique:
    print("✅ AnnData obs_names is unique")
else:
    dup_obs = [k for k, v in Counter(map(str, adata.obs_names)).items() if v > 1][:5]
    print("⚠️ AnnData obs_names has duplicates. Examples:", dup_obs)

# ---- Small samples to inspect visually ----
print("\n🧪 Sample of EXP index (5):", list(map(str, X.index[:5])))
print("🧪 Sample of EXP columns (5):", list(map(str, X.columns[:5])))

# If there is some overlap, show a few matching IDs (index or columns).
# Sorted so the printed examples are the same on every run; raw set order of
# strings can differ between kernel sessions.
if index_overlap > 0:
    matches = sorted(index_matches)
    print("✅ Example matching IDs (index):", matches[:5])
if cols_overlap > 0:
    matches_c = sorted(column_matches)
    print("✅ Example matching IDs (columns):", matches_c[:5])

print("\n🟩 Done printing. Send me the cell output above and I'll adapt the PCA code accordingly.")

🧬 EXP raw shape: (39715, 30314) (rows × cols)
🧬 AnnData cells: 39,715

🧾 EXP index:
  name: SIDM | dtype: category
  first 5 index values: ['SIDM00890', 'SIDM00965', 'SIDM00333', 'SIDM01101', 'SIDM00722']

🧾 EXP columns (first 10):
['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2ML1-AS1', 'A2ML1-AS2', 'A3GALT2', 'A4GALT']
  # numeric columns: 30314
  # object columns : 0

ℹ️ No obvious ID column names detected in EXP (we'll probe string columns).

📇 AnnData obs_names (first 5): ['C32_SKIN', 'NCIH446_LUNG', 'MFE319_ENDOMETRIUM', 'SKNAS_AUTONOMIC_GANGLIA', 'NCIH2452_PLEURA']
🧪 GROUP_KEY value counts (top 5):
SIDM
SIDM00703    1994
SIDM00144    1185
SIDM01082    1079
SIDM01060     833
SIDM00806     832
Name: count, dtype: int64

🔗 Overlap: EXP.index ∩ obs_names = 0
🔗 Overlap: EXP.columns ∩ obs_names = 0

⚠️ No string column in EXP overlaps with obs_names.

⚠️ EXP index has duplicates. Examples: ['SIDM00890', 'SIDM00965', 'SIDM00333', 'SIDM01101', 'SIDM00722']
✅ AnnData obs_name

In [6]:
# The EXP index holds the SIDM group IDs (one repeated entry per cell) rather
# than per-cell IDs, so it is used directly as the grouping label and no
# alignment against adata.obs_names is attempted.

# Keep only numeric gene columns and cast to float32 to reduce memory.
X = X.select_dtypes(include=[np.number]).astype(np.float32)

# Take group labels from the index when it is named after GROUP_KEY
# (case-insensitive); comparing against GROUP_KEY instead of a literal keeps
# this check in step with the config cell.
if str(getattr(X.index, "name", "")).upper() == str(GROUP_KEY).upper():
    groups = X.index.astype(str).to_numpy()
else:
    # Fallback: rely on AnnData row order, which is only valid when both
    # tables have the same number of rows.
    assert len(X) == adata.n_obs, "Row count mismatch; cannot attach group labels."
    groups = adata.obs[GROUP_KEY].astype(str).to_numpy()

print(f"✅ Matrix for PCA: {X.shape[0]:,} cells × {X.shape[1]:,} genes")
print(f"✅ Group vector length: {len(groups):,} (e.g., {np.unique(groups)[:5]})")

# ----- Fit PCA (N_PCS components) -----
if USE_INCREMENTAL:
    from sklearn.utils import gen_batches

    pca = IncrementalPCA(n_components=N_PCS, batch_size=BATCH_SIZE)
    # gen_batches folds a short tail into the preceding batch so that no batch
    # has fewer than N_PCS rows; partial_fit can reject such a batch (the exact
    # rule depends on the scikit-learn version). Batches are contiguous and in
    # order, so the stacked scores keep the row order of X.
    row_batches = list(gen_batches(X.shape[0], BATCH_SIZE, min_batch_size=N_PCS))
    for rows in row_batches:
        pca.partial_fit(X.iloc[rows].values)
    PCs = np.vstack([pca.transform(X.iloc[rows].values) for rows in row_batches])
else:
    # Randomized SVD solver with a fixed seed for repeatable results; no whitening.
    pca = PCA(n_components=N_PCS, svd_solver="randomized", random_state=13)
    PCs = pca.fit_transform(X.values)

# Per-cell scores, labelled PC1..PC{N_PCS}, carrying the same index as X.
pc_cols = [f"PC{i}" for i in range(1, N_PCS + 1)]
pc_df = pd.DataFrame(PCs, index=X.index, columns=pc_cols)
pc_df["group"] = groups

# ----- Pseudo-bulk aggregation (mean per group) -----
# One row per group; the group label ends up in a "SANGER_MODEL_ID" column.
pseudo_bulk = (pc_df
               .groupby("group")[pc_cols]
               .mean()
               .rename_axis("SANGER_MODEL_ID")
               .reset_index())

# ----- Save everything important to BASE_DIR -----
# Paths are defined once so the files written and the names printed cannot drift apart.
cell_pcs_path = BASE_DIR / "pancancer_cell_PCs_30.parquet"
pseudo_bulk_path = BASE_DIR / "pancancer_pseudobulk_30PCs.parquet"
pca_model_path = BASE_DIR / "pancancer_expr_pca.joblib"
gene_list_path = BASE_DIR / "pancancer_gene_columns.txt"

# Per-cell PCs are stored without the helper "group" column.
pc_df.drop(columns=["group"]).to_parquet(cell_pcs_path)
pseudo_bulk.to_parquet(pseudo_bulk_path)
joblib.dump(pca, pca_model_path)
# Gene order used for the fit, one name per line (no header, no index).
pd.Series(X.columns, name="gene").to_csv(gene_list_path, index=False, header=False)

print("💾 Saved:",
      cell_pcs_path.name,
      pseudo_bulk_path.name,
      pca_model_path.name,
      gene_list_path.name)


✅ Matrix for PCA: 39,715 cells × 30,314 genes
✅ Group vector length: 39,715 (e.g., ['SIDM00078' 'SIDM00080' 'SIDM00082' 'SIDM00088' 'SIDM00092'])
💾 Saved: pancancer_cell_PCs_30.parquet pancancer_pseudobulk_30PCs.parquet pancancer_expr_pca.joblib pancancer_gene_columns.txt
