In [None]:
import os
import pandas as pd
import numpy as np
import scanpy as sc

# ---------------------------------------------------------------------------
# Configuration: input files (relative paths) and output directory
# ---------------------------------------------------------------------------

# AnnData file; adata.var is expected to carry 'highly_deviant'
H5AD_FEATURES = "../../data/processed/breast_cancer_dimred.h5ad"

# Single-cell matrix (cells×genes) whose columns are gene SYMBOLS
SC_PARQUET_SYM = "../../data/processed/breast_sc_log1pCP10k__symbols_mapped.parquet"

# Bulk matrix (samples×genes) whose columns are SIDG identifiers
BULK_VOOM = "../../data/processed/voom_transformed_data.parquet"

# Every artifact produced by this notebook is written below OUTDIR;
# exist_ok=True keeps the cell re-runnable.
OUTDIR = "scry_hvg_outputs"
os.makedirs(OUTDIR, exist_ok=True)


In [2]:
# Read the AnnData object referenced by H5AD_FEATURES.
adata = sc.read(H5AD_FEATURES)

# Take an independent (deep) copy of the per-gene annotation table so that
# later cells can inspect or modify `var` without touching adata.var.
var = adata.var.copy(deep=True)


In [3]:
# Select the scry feature set from adata.var and export it as gene symbols.
#
# Inputs  : `var` (copy of adata.var), `adata` (for var_names), `OUTDIR`.
# Outputs : `mask` (boolean array over genes), `symbols` (unique pd.Index of
#           str, first-occurrence order) and OUTDIR/scry_genes_symbols.csv.
assert ("highly_deviant" in var.columns) or ("binomial_deviance" in var.columns), \
    "Esperava 'highly_deviant' ou 'binomial_deviance' em adata.var."

# Prefer the boolean mask that was saved with the AnnData object.
if "highly_deviant" in var.columns:
    # A missing flag must count as "not selected": astype(bool) alone would
    # turn NaN into True.
    mask = var["highly_deviant"].fillna(False).astype(bool).to_numpy()
else:
    # Fallback: take the top 4000 genes by binomial deviance.
    top_k = 4000
    deviance = var["binomial_deviance"].to_numpy(dtype=float, na_value=np.nan)
    ranked = np.argsort(deviance)
    # argsort places NaN last, so a plain [-top_k:] slice would pick genes with
    # missing deviance as the "top" ones; remove them before slicing.
    ranked = ranked[~np.isnan(deviance[ranked])]
    mask = np.zeros(len(var), dtype=bool)
    mask[ranked[-top_k:]] = True

# Get symbols robustly
if "gene_symbol" in var.columns:
    selected_symbols = var.loc[mask, "gene_symbol"]
    n_without_symbol = int(selected_symbols.isna().sum())
    if n_without_symbol:
        # astype(str) would convert a missing symbol into the literal "nan",
        # which would then be exported (and mapped) as if it were a gene.
        print("Selected genes without a gene_symbol (dropped):", n_without_symbol)
    symbols = pd.Index(selected_symbols.dropna().astype(str))
else:
    # If var_names are already symbols, this is a no-op
    symbols = pd.Index(adata.var_names.astype(str))[mask]

# Deduplicate while preserving order
symbols = pd.Index(pd.unique(symbols))
print("Genes (símbolos) selecionados pelo scry:", len(symbols))

# Save symbol list
symbols.to_series().to_csv(os.path.join(OUTDIR, "scry_genes_symbols.csv"), index=False, header=["symbol"])


Genes (símbolos) selecionados pelo scry: 3999


In [None]:
# Map the scry-selected symbols into SIDG space and intersect with the bulk genes.
#
# Inputs  : `symbols` (pd.Index of str), `BULK_VOOM`, `OUTDIR`.
# Outputs : `sym2sidg` (dict symbol -> SIDG), `sidg_list`, `bulk`, `bulk_cols`,
#           `overlap_sidg` (in sidg_list order) and two CSV files in OUTDIR.
SIDG_MAP_CSV   = "../../../../maps/gene_id_to_symbol_mapping.csv"                           # columns: gene_id (SIDG...), symbol

# Load SIDG mapping and build symbol->SIDG dict.
# Rows with a missing symbol or gene_id are dropped first: astype(str) would
# otherwise turn them into the literal string "nan" and create a bogus
# "nan" key/value in the dictionary (read_csv also parses cells such as "NA"
# or "nan" as missing by default).
m = pd.read_csv(SIDG_MAP_CSV).dropna(subset=["symbol", "gene_id"])
# NOTE: if a symbol occurs on several rows, the last row wins (dict semantics).
sym2sidg = dict(zip(m["symbol"].astype(str), m["gene_id"].astype(str)))

# Symbols absent from the mapping become NaN in .map() and are dropped here.
sidg_list = symbols.map(sym2sidg).dropna().unique().tolist()
print("Scry genes mapeados para SIDG:", len(sidg_list))

# Bulk genes (SIDG space)
bulk = pd.read_parquet(BULK_VOOM)
bulk_cols = bulk.columns.astype(str)

# Keep sidg_list order so downstream matrices share one column order.
overlap_sidg = [g for g in sidg_list if g in bulk_cols]
print("Overlap com bulk (SIDG):", len(overlap_sidg))

# Save overlap lists
pd.Series(overlap_sidg, name="SIDG").to_csv(os.path.join(OUTDIR, "scry_bulk_overlap_SIDG.csv"), index=False)
pd.Series(symbols, name="symbol").to_csv(os.path.join(OUTDIR, "scry_symbols_all.csv"), index=False)


Scry genes mapeados para SIDG: 3952
Overlap com bulk (SIDG): 3952


: 

In [None]:
# Write the bulk and single-cell matrices restricted to the scry/bulk overlap,
# with the single-cell matrix also exported in SIDG space using exactly the
# same column order as the bulk matrix.
#
# Inputs  : `bulk`, `overlap_sidg`, `symbols`, `sym2sidg`, `SC_PARQUET_SYM`, `OUTDIR`.
# Outputs : `bulk_f`, `sc_sym`, `sc_sym_f`, `sc_sidg` and three parquet files.
# Raises  : KeyError if a gene in `overlap_sidg` has no matching symbol column
#           in the single-cell matrix (the two outputs could not be aligned).

# BULK filtered (SIDG), columns in overlap order
bulk_f = bulk.loc[:, overlap_sidg]
bulk_f.to_parquet(os.path.join(OUTDIR, "bulk_scry_overlap_SIDG.parquet"))
print("Saved bulk filtered:", bulk_f.shape)

# SC filtered by symbols (keep only those that map into overlap_sidg)
sc_sym = pd.read_parquet(SC_PARQUET_SYM)
print("Single-cell (símbolos) original:", sc_sym.shape)
sc_columns = set(sc_sym.columns)

# Determine the symbol corresponding to each SIDG.
# sym2sidg is many-to-one (several symbols may share a SIDG), so blindly
# inverting it keeps whichever alias appears last in the mapping file, which
# may be neither scry-selected nor a column of sc_sym. Resolve the alias from
# the scry-selected symbols instead, preferring one present in sc_sym.
selected_sidg2sym = {}
for sym in symbols:
    sidg = sym2sidg.get(sym)
    if sidg is None:
        continue
    current = selected_sidg2sym.get(sidg)
    if current is None or (current not in sc_columns and sym in sc_columns):
        selected_sidg2sym[sidg] = sym

# Keep the global inverse for SIDGs outside the selection (backward compatible),
# but let the scry-selected aliases take precedence.
sidg2sym = {v: k for k, v in sym2sidg.items()}
sidg2sym.update(selected_sidg2sym)
overlap_symbols = [sidg2sym[s] for s in overlap_sidg if s in sidg2sym]

# Keep available symbols only
overlap_symbols_present = [g for g in overlap_symbols if g in sc_columns]

sc_sym_f = sc_sym.loc[:, overlap_symbols_present]
sc_sym_f.to_parquet(os.path.join(OUTDIR, "sc_scry_overlap_SYMBOLS.parquet"))
print("Saved sc (símbolos) filtered:", sc_sym_f.shape)

# Also produce an SC matrix with SIDG columns (to plug directly into bulk-trained models)
sc_sidg_cols = [sym2sidg.get(g) for g in sc_sym_f.columns]
sc_sidg = sc_sym_f.copy()
sc_sidg.columns = sc_sidg_cols

# Fail with an explicit message (instead of an opaque pandas KeyError) when the
# single-cell matrix cannot cover every bulk-overlap gene.
available_sidg = set(sc_sidg.columns)
missing_in_sc = [g for g in overlap_sidg if g not in available_sidg]
if missing_in_sc:
    raise KeyError(
        f"{len(missing_in_sc)} gene(s) of the bulk overlap have no matching symbol "
        f"column in the single-cell matrix (first few SIDG: {missing_in_sc[:10]}); "
        "bulk and single-cell outputs cannot be aligned."
    )

# Enforce exact same order as bulk_f
sc_sidg = sc_sidg.loc[:, overlap_sidg]

sc_sidg.to_parquet(os.path.join(OUTDIR, "sc_scry_overlap_SIDG.parquet"))
print("Saved sc (SIDG) filtered/aligned:", sc_sidg.shape)

# Final sanity check: identical column order with bulk_f
assert list(sc_sidg.columns) == list(bulk_f.columns)
print("✅ Column order aligned between bulk and sc.")


Saved bulk filtered: (1362, 3952)
