In [1]:
import pandas as pd
import os

# Paths
BULK_PARQUET   = "../../data/processed/bulk_log1p.parquet"   # bulk full voom matrix
SC_PARQUET     = "../../data/processed/breast_sc_log1pCP10k__symbols_sidm_mapped.parquet"  # sc mapped to symbols
OVERLAP_CSV    = "hvg_overlap_outputs/HVG_overlap_bulk_sc.csv"    # overlap list (symbols)

OUTDIR = "../../data/filtered_datasets"
os.makedirs(OUTDIR, exist_ok=True)

OUT_BULK = os.path.join(OUTDIR, "bulk_overlap_genes.parquet")
OUT_SC   = os.path.join(OUTDIR, "sc_overlap_genes.parquet")


In [2]:
# Bulk expression (samples × genes)
bulk = pd.read_parquet(BULK_PARQUET)
print("Bulk shape:", bulk.shape)

# sc expression (cells × genes)
#sc = pd.read_parquet(SC_PARQUET)
#print("sc shape:", sc.shape)

# Overlap gene list
overlap = pd.read_csv(OVERLAP_CSV).iloc[:,0].astype(str).tolist()
print("Overlap genes:", len(overlap))


Bulk shape: (1362, 37602)
Overlap genes: 2044


In [3]:
# Load SIDG -> symbol mapping
mapping = pd.read_csv("../../../../maps/gene_id_to_symbol_mapping.csv")
sym2sidg = dict(zip(mapping["symbol"], mapping["gene_id"]))


In [5]:
# Vectorized reindexing of columns
#sc = sc.rename(columns=sym2sidg)

# Drop columns that failed to map (those still strings instead of IDs)
#sc = sc.loc[:, sc.columns.isin(mapping["gene_id"])]

#print("sc shape after remapping to SIDG IDs:", sc.shape)


In [6]:
# Now filter both bulk and sc to the overlap genes
overlap = pd.read_csv(OVERLAP_CSV).iloc[:,0].astype(str).tolist()

bulk_filtered = bulk.loc[:, [g for g in overlap if g in bulk.columns]]
#sc_filtered   = sc.loc[:,   [g for g in overlap if g in sc.columns]]

# Enforce same order
bulk_filtered = bulk_filtered[overlap]
#sc_filtered   = sc_filtered[overlap]

print("Filtered bulk shape:", bulk_filtered.shape)
#print("Filtered sc shape:", sc_filtered.shape)

# Sanity check
#assert list(bulk_filtered.columns) == list(sc_filtered.columns)


Filtered bulk shape: (1362, 2044)


In [7]:
bulk_filtered.to_parquet(OUT_BULK)
#sc_filtered.to_parquet(OUT_SC)

print("Saved bulk:", OUT_BULK)
print("Saved sc:", OUT_SC)


Saved bulk: ../../data/filtered_datasets/bulk_overlap_genes.parquet
Saved sc: ../../data/filtered_datasets/sc_overlap_genes.parquet
