In [2]:
import os
import re
import joblib
import numpy as np
import pandas as pd

# Input locations, relative to the notebook's working directory (edit if needed).
# SC_PARQUET: table that is scanned below for columns named `emb_<digits>`.
SC_PARQUET   = "../../data/filtered_datasets/breast_cancer_embeddings.parquet"
# BULK_PARQUET: table from which the probed model's feature columns are read.
# NOTE(review): a later cell reassigns BULK_PARQUET to a different file under
# filtered_datasets/ — confirm which path each analysis is meant to use.
BULK_PARQUET = "../../data/gdsc_bulk_embeddings_voom.parquet"
MODEL_DIR    = "embeddings"   # directory scanned for saved *.joblib model bundles


In [3]:
# Read the single-cell table and keep the names of its embedding columns,
# i.e. the columns whose label is exactly "emb_" followed by digits.
sc = pd.read_parquet(SC_PARQUET)
sc_cols = list(
    filter(lambda label: re.fullmatch(r"emb_\d+", str(label)), sc.columns)
)
print("SC embedding columns:", len(sc_cols))

def inspect_bundle(path):
    """Compare a saved model bundle's feature list with the SC embedding columns.

    Loads the bundle at ``path`` with ``joblib.load``, takes its feature names
    from the first of ``'feature_cols'``, ``'gene_cols'`` or ``'emb_cols'`` that
    holds a non-empty sequence, and prints a one-line summary of how those
    names line up with the module-level ``sc_cols`` list (plus the first eight
    missing names, if any).

    Parameters
    ----------
    path : str
        Path to a ``.joblib`` bundle; the loaded object must support ``.get``.

    Returns
    -------
    tuple of (int, int)
        ``(n_missing, n_extra)``: the number of bundle features absent from
        ``sc_cols`` and the number of ``sc_cols`` entries the bundle does not use.

    Raises
    ------
    KeyError
        If none of the recognised keys holds a non-empty feature list.
    """
    # NOTE: joblib.load unpickles arbitrary objects — only point this at
    # bundles from a trusted source.
    b = joblib.load(path)

    feats = None
    for key in ("feature_cols", "gene_cols", "emb_cols"):
        candidate = b.get(key)
        # Explicit None/len checks rather than chaining with `or`: the truth
        # value of a NumPy array or pandas Index is ambiguous and raises.
        if candidate is not None and len(candidate) > 0:
            feats = candidate
            break
    if feats is None:
        raise KeyError(
            f"{path} has no feature list "
            "(expected 'feature_cols', 'gene_cols' or 'emb_cols')."
        )
    feats = [str(c) for c in feats]

    # Set lookups keep both comparisons linear in the number of columns;
    # the result lists still follow the original column order.
    sc_lookup = set(sc_cols)
    feat_lookup = set(feats)
    missing = [c for c in feats if c not in sc_lookup]
    extra   = [c for c in sc_cols if c not in feat_lookup]

    print(f"• {os.path.basename(path)} -> needs {len(feats)} feats | missing {len(missing)} | extra {len(extra)}")
    if missing:
        print("   first missing:", missing[:8])
    return len(missing), len(extra)

# Run the feature check on every *.joblib file in MODEL_DIR (sorted by
# filename) and keep one (filename, n_missing, n_extra) tuple per bundle.
bundle_stats = [
    (fname, *inspect_bundle(os.path.join(MODEL_DIR, fname)))
    for fname in sorted(os.listdir(MODEL_DIR))
    if fname.endswith(".joblib")
]

if len(bundle_stats) == 0:
    print("No .joblib models found in", MODEL_DIR)


SC embedding columns: 3072
• elasticnet_drug1089_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug1096_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug1526_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug1845_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug1931_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug2038_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug2508_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug2515_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug2540_log1p.joblib -> needs 3072 feats | missing 0 | extra 0
• elasticnet_drug427_log1p.joblib -> needs 3072 feats | missing 0 | extra 0


In [4]:
# pick one model to probe: the first *.joblib in MODEL_DIR by sorted filename
# (edit to a file that exists)
one_model = next(
    (
        os.path.join(MODEL_DIR, fname)
        for fname in sorted(os.listdir(MODEL_DIR))
        if fname.endswith(".joblib")
    ),
    None,
)
assert one_model is not None, "No model found."

print("Using:", os.path.basename(one_model))
# NOTE: joblib.load unpickles arbitrary objects — only load bundles you trust.
bundle = joblib.load(one_model)
pipe   = bundle["pipeline"]

# Accept the same feature-list keys that inspect_bundle checks, and fail with
# a readable error instead of a TypeError from iterating over None.
# Explicit None/len checks avoid the ambiguous truth value of arrays.
feature_keys = ("feature_cols", "gene_cols", "emb_cols")
raw_feats = next(
    (
        bundle[k]
        for k in feature_keys
        if bundle.get(k) is not None and len(bundle[k]) > 0
    ),
    None,
)
if raw_feats is None:
    raise KeyError(
        f"{one_model} has no feature list "
        "(expected 'feature_cols', 'gene_cols' or 'emb_cols')."
    )
feats  = [str(c) for c in raw_feats]

# Pull the model's feature columns out of both tables as float32 matrices.
# Selecting with `feats` raises KeyError if either table lacks a feature.
bulk = pd.read_parquet(BULK_PARQUET)
Xb = bulk[feats].astype("float32").values
Xs = sc[feats].astype("float32").values

# Look up the pipeline's preprocessing steps by name; a step that is not
# present comes back as None and is skipped.
steps = pipe.named_steps
imputer = steps.get("impute")
scaler = steps.get("scale")

if imputer is not None:
    Xb, Xs = imputer.transform(Xb), imputer.transform(Xs)

# Without a "scale" step the (possibly imputed) matrices pass through as-is.
Xb_z = Xb if scaler is None else scaler.transform(Xb)
Xs_z = Xs if scaler is None else scaler.transform(Xs)

def describe(name, Z):
    """Print mean, std and the share of |z| > 5 over the finite entries of ``Z``.

    ``name`` is the label that prefixes the printed line. Non-finite entries
    of ``Z`` (NaN, +/-inf) are dropped before any statistic is computed.
    """
    finite_vals = Z[np.isfinite(Z)]
    mu = finite_vals.mean()
    sigma = finite_vals.std()
    outlier_frac = (np.abs(finite_vals) > 5).mean()
    print(f"{name}: mean={mu:.3f}, std={sigma:.3f}, |z|>5 frac={outlier_frac:.3%}")

# Summarise both matrices after the pipeline's preprocessing so that their
# distributions can be compared side by side.
for label, matrix in (("bulk z", Xb_z), ("sc   z", Xs_z)):
    describe(label, matrix)


Using: elasticnet_drug1089_log1p.joblib
bulk z: mean=0.000, std=0.998, |z|>5 frac=0.168%
sc   z: mean=0.551, std=12.274, |z|>5 frac=65.815%


In [5]:
# parse any drug id from filename like ...drug1845...
m = re.search(r"drug(\d+)", os.path.basename(one_model))
drug_id = int(m.group(1)) if m else None
print("Parsed drug_id:", drug_id)

# Compare the model's predictions on the bulk rows of this drug with the
# recorded LN_IC50 values (mean and standard deviation of each).
if drug_id is not None and "DRUG_ID" in bulk.columns and "LN_IC50" in bulk.columns:
    dfb = bulk.loc[bulk["DRUG_ID"] == drug_id].dropna(subset=["LN_IC50"])
    if dfb.empty:
        # Predicting on a zero-row matrix would fail inside the pipeline, and
        # the summary statistics would be NaN; report the empty match instead.
        print(f"No bulk rows with a non-missing LN_IC50 for DRUG_ID == {drug_id}; skipping prediction check.")
    else:
        Xb  = dfb[feats].astype("float32").values
        yb  = dfb["LN_IC50"].values
        yhat = pipe.predict(Xb)
        print(f"bulk true  mean±sd: {yb.mean():.3f} ± {yb.std():.3f}")
        print(f"bulk pred  mean±sd: {yhat.mean():.3f} ± {yhat.std():.3f}")
else:
    print("Could not test bulk prediction range (missing DRUG_ID/LN_IC50 or id parse).")


Parsed drug_id: 1089
bulk true  mean±sd: 3.804 ± 1.736
bulk pred  mean±sd: 3.804 ± 1.084


In [13]:
# --- Paths for the bulk vs single-cell embedding plots (edit if needed) ---
# NOTE(review): BULK_PARQUET here points to a different file than the value
# assigned near the top of this file; from this cell on the name refers to
# this one — confirm that both paths are intentional.
BULK_PARQUET = "../../data/filtered_datasets/bulk_embeddings_voom.parquet"          # bulk embeddings (per line)
SC_PARQUET   = "../../data/filtered_datasets/breast_cancer_embeddings.parquet"  # single-cell embeddings
OUT_DIR      = "out/plots_emb"  # directory the figures are saved into (created below if absent)
# --------------------------------

import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Make sure the plot directory exists before anything is saved into it.
os.makedirs(OUT_DIR, exist_ok=True)

# UMAP is optional: HAS_UMAP records whether the import worked, so the UMAP
# cell can be skipped when it did not.
HAS_UMAP = False
try:
    import umap
except Exception:
    print("⚠️ umap-learn not found; will only run t-SNE. Install with: pip install umap-learn")
else:
    HAS_UMAP = True


In [14]:
# Single-cell embeddings.
sc = pd.read_parquet(SC_PARQUET)

# Guarantee a string-typed SANGER_MODEL_ID column. When the column is absent
# the (string-cast) index is used instead.
# NOTE(review): that fallback assumes the index holds the model ids — confirm.
if "SANGER_MODEL_ID" not in sc.columns:
    sc = sc.copy()
    sc.index = sc.index.astype(str)
    sc["SANGER_MODEL_ID"] = sc.index
else:
    sc["SANGER_MODEL_ID"] = sc["SANGER_MODEL_ID"].astype(str)

# Embedding columns are those named exactly "emb_" followed by digits.
emb_cols_sc = list(
    filter(lambda label: re.fullmatch(r"emb_\d+", str(label)), sc.columns)
)
assert emb_cols_sc, "No emb_### columns found in SC parquet."

# Distinct model ids present in the single-cell table.
bc_lines = sc["SANGER_MODEL_ID"].unique().astype(str)

# Bulk embeddings, with the model id cast to str so it compares cleanly
# against the single-cell ids.
bulk = pd.read_parquet(BULK_PARQUET)
bulk["SANGER_MODEL_ID"] = bulk["SANGER_MODEL_ID"].astype(str)

# Same "emb_<digits>" naming rule as for the single-cell table.
emb_cols_bulk = list(
    filter(lambda label: re.fullmatch(r"emb_\d+", str(label)), bulk.columns)
)
assert emb_cols_bulk, "No emb_### columns found in bulk parquet."

# Shared feature schema: columns present in both tables, sorted by name.
emb_cols = sorted(set(emb_cols_sc) & set(emb_cols_bulk))
assert emb_cols, "No overlapping embedding columns between SC and bulk."

# Keep only bulk rows whose model id also occurs in the single-cell data.
in_sc = bulk["SANGER_MODEL_ID"].isin(bc_lines)
bulk_bc = bulk.loc[in_sc, ["SANGER_MODEL_ID", *emb_cols]].copy()

print(f"Breast-cancer lines in SC: {len(bc_lines)}")
print(f"Bulk lines overlapping with SC: {bulk_bc['SANGER_MODEL_ID'].nunique()}")
print(f"Using {len(emb_cols)} shared embedding features.")


Breast-cancer lines in SC: 28
Bulk lines overlapping with SC: 28
Using 3072 shared embedding features.


In [15]:
# (A) Pseudo-bulk: average the single-cell embeddings within each model id,
#     then turn the id back into a column and tag the rows as "SC_mean".
sc_line_means = sc.groupby("SANGER_MODEL_ID")[emb_cols].mean()
sc_pb = sc_line_means.reset_index()
sc_pb["modality"] = "SC_mean"

# (B) Per-cell subsample for visualization (keeps the within-line spread).
#     Raise or lower CELLS_PER_LINE for larger or smaller plots.
CELLS_PER_LINE = 100  # 0 to skip per-cell scatter
def subsample_cells(df, key="SANGER_MODEL_ID", n=100, random_state=42):
    """Return at most ``n`` randomly chosen rows from each ``key`` group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from; must contain the column named by ``key``
        whenever ``n`` is positive.
    key : str, default "SANGER_MODEL_ID"
        Column whose distinct values define the groups.
    n : int, default 100
        Maximum number of rows kept per group. Groups with fewer rows are
        kept whole. ``n <= 0`` disables sampling altogether.
    random_state : int or None, default 42
        Passed to ``DataFrame.sample`` for every group; the default
        reproduces the previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all groups concatenated (original index labels
        kept), or an empty frame with ``df``'s columns when nothing is
        sampled.
    """
    if n <= 0:
        # Nothing requested: skip the grouping pass entirely.
        return pd.DataFrame(columns=df.columns)
    parts = [
        group.sample(min(n, len(group)), random_state=random_state)
        for _, group in df.groupby(key)
    ]
    return pd.concat(parts, axis=0) if parts else pd.DataFrame(columns=df.columns)

# Per-cell block: a subsample of each line's cells, tagged "SC_cell" unless
# the subsample came back empty.
sc_cells_sub = subsample_cells(sc[["SANGER_MODEL_ID", *emb_cols]], n=CELLS_PER_LINE)
if len(sc_cells_sub) > 0:
    sc_cells_sub = sc_cells_sub.assign(modality="SC_cell")

# Bulk block, tagged "BULK".
# NOTE(review): presumably one row per line — confirm the bulk table has no
# repeated model ids.
bulk_block = bulk_bc.assign(modality="BULK")

# Stack the three blocks into the frame used for the plots, with a fresh
# 0..n-1 index, and show how many rows each modality contributes.
blocks = [bulk_block, sc_pb, sc_cells_sub]
viz_df = pd.concat(blocks, axis=0, ignore_index=True)
print(viz_df["modality"].value_counts())
viz_df.head()


modality
SC_cell    2789
BULK         28
SC_mean      28
Name: count, dtype: int64


Unnamed: 0,SANGER_MODEL_ID,emb_000,emb_001,emb_002,emb_003,emb_004,emb_005,emb_006,emb_007,emb_008,...,emb_991,emb_992,emb_993,emb_994,emb_995,emb_996,emb_997,emb_998,emb_999,modality
0,SIDM00097,-0.674053,1.973369,0.660372,0.904649,-0.966278,-0.159582,1.155603,-1.436503,-0.066184,...,-0.157283,0.817895,-1.366784,0.324417,-0.050555,0.923206,2.701583,1.076968,0.173005,BULK
1,SIDM00122,-0.815378,1.754815,0.807438,0.996977,-0.998065,-0.395482,1.069174,-1.250717,-0.281092,...,-0.399067,0.671727,-1.6199,0.309441,-0.39253,0.844497,2.647812,1.242498,0.269794,BULK
2,SIDM00135,-0.848479,1.718834,0.802478,1.080877,-1.090048,-0.391177,1.049456,-1.288212,-0.244145,...,-0.485325,0.672618,-1.57913,0.383446,-0.438844,0.765527,2.690419,1.119272,0.219548,BULK
3,SIDM00148,-0.78992,1.878227,0.67834,1.104195,-1.01588,-0.324984,1.115078,-1.26692,-0.022967,...,-0.367772,0.733806,-1.511164,0.552563,-0.072372,0.776538,2.659569,1.212346,-0.093398,BULK
4,SIDM00272,-0.80754,1.84778,0.743537,1.081425,-1.031805,-0.404933,1.079759,-1.235994,-0.165143,...,-0.314622,0.689712,-1.601412,0.485431,-0.276045,0.788555,2.656567,1.179114,0.166491,BULK


In [16]:
# Standardize jointly so t-SNE/UMAP see comparable scales
X = viz_df[emb_cols].to_numpy(dtype=np.float32)
scaler = StandardScaler()
Xz = scaler.fit_transform(X)

# PCA to speed up t-SNE/UMAP & denoise a bit.
# n_components may not exceed min(n_samples, n_features), so cap it by both
# dimensions: with CELLS_PER_LINE = 0 only the bulk and pseudo-bulk rows
# remain, which can be fewer than 100.
n_samples, n_features = Xz.shape
PCA_DIM = min(100, n_samples, n_features)
Xp = PCA(n_components=PCA_DIM, random_state=42).fit_transform(Xz)

# t-SNE (run time depends on number of points; PCA-D helps).
# perplexity must stay below the number of samples, so clamp it for small inputs.
TSNE_PERPLEXITY = min(30, max(1, n_samples - 1))
tsne = TSNE(n_components=2, perplexity=TSNE_PERPLEXITY, learning_rate="auto", init="pca", random_state=42)
X_tsne = tsne.fit_transform(Xp)

# Attach the 2-D coordinates to a copy so viz_df itself stays unchanged.
viz_df_tsne = viz_df.copy()
viz_df_tsne["tsne_1"] = X_tsne[:,0]
viz_df_tsne["tsne_2"] = X_tsne[:,1]

# Scatter the t-SNE coordinates, one marker style per modality, and save the
# figure under OUT_DIR.
fig, ax = plt.subplots(figsize=(7, 6))
marker_by_modality = {"BULK": "o", "SC_mean": "^", "SC_cell": "."}
for modality, marker in marker_by_modality.items():
    points = viz_df_tsne[viz_df_tsne["modality"] == modality]
    if len(points) == 0:
        continue
    # Individual cells are drawn smaller than the per-line points.
    size = 6 if modality == "SC_cell" else 18
    ax.scatter(points["tsne_1"], points["tsne_2"], s=size, marker=marker, alpha=0.7, label=modality)
ax.legend()
ax.set_title("t-SNE of embeddings (breast-cancer lines): bulk vs single-cell")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
out_tsne = os.path.join(OUT_DIR, "tsne_bulk_vs_sc.png")
fig.tight_layout()
fig.savefig(out_tsne, dpi=150)
plt.close(fig)
print("Saved:", out_tsne)


Saved: out/plots_emb/tsne_bulk_vs_sc.png


In [17]:
if HAS_UMAP:
    # 2-D UMAP of the PCA-reduced matrix, with a fixed random_state.
    reducer = umap.UMAP(n_neighbors=30, min_dist=0.15, n_components=2, random_state=42, metric="euclidean")
    X_umap = reducer.fit_transform(Xp)

    # Attach the coordinates to a copy so viz_df itself stays unchanged.
    viz_df_umap = viz_df.copy()
    viz_df_umap["umap_1"] = X_umap[:, 0]
    viz_df_umap["umap_2"] = X_umap[:, 1]

    # Scatter the UMAP coordinates, one marker style per modality, and save
    # the figure under OUT_DIR.
    fig, ax = plt.subplots(figsize=(7, 6))
    marker_by_modality = {"BULK": "o", "SC_mean": "^", "SC_cell": "."}
    for modality, marker in marker_by_modality.items():
        points = viz_df_umap[viz_df_umap["modality"] == modality]
        if len(points) == 0:
            continue
        # Individual cells are drawn smaller than the per-line points.
        size = 6 if modality == "SC_cell" else 18
        ax.scatter(points["umap_1"], points["umap_2"], s=size, marker=marker, alpha=0.7, label=modality)
    ax.legend()
    ax.set_title("UMAP of embeddings (breast-cancer lines): bulk vs single-cell")
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")
    out_umap = os.path.join(OUT_DIR, "umap_bulk_vs_sc.png")
    fig.tight_layout()
    fig.savefig(out_umap, dpi=150)
    plt.close(fig)
    print("Saved:", out_umap)


  warn(


Saved: out/plots_emb/umap_bulk_vs_sc.png
