In [None]:
import pandas as pd

# Load the pseudo-bulk dataset (single-cell data aggregated per model).
pseudo_bulk = pd.read_csv("../../data/processed/pancancer_pseudobulk_30_pcs_tissue_growth.csv")

# The model IDs are taken from the first column (selected by position).
# Missing values are dropped *before* the cast to str: on an object column,
# `astype(str)` turns NaN into the literal text "nan", which `.str.upper()`
# would then turn into a bogus "NAN" model ID that inflates the count below.
model_ids_clean = pseudo_bulk.iloc[:, 0].dropna().astype(str).str.strip().str.upper()

# Whitespace-only cells collapse to "" after stripping; they are not IDs either.
single_cell_ids = model_ids_clean[model_ids_clean != ""].unique()

print(f"✅ Extracted {len(single_cell_ids)} unique single-cell model IDs")


✅ Extracted 138 unique single-cell model IDs


In [6]:
# Read the two bulk tables and normalise their SANGER_MODEL_ID column on load
# (surrounding whitespace removed, upper-cased).
bulk_1 = pd.read_parquet(
    "../../../../bulk_state_of_the_art/data/processed/bulk_with_pca.parquet"
).assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].str.strip().str.upper())
bulk_2 = pd.read_parquet(
    "../../../../bulk_state_of_the_art/data/processed/bulk_with_pca_embeddings.parquet"
).assign(SANGER_MODEL_ID=lambda frame: frame["SANGER_MODEL_ID"].str.strip().str.upper())

# NOTE(review): the labels printed below carry a "_tissue_growth" suffix that
# the parquet file names loaded above do not have — confirm which naming is current.

# Shapes of both tables
print(f"📦 bulk_with_pca_tissue_growth: {bulk_1.shape}")
print(f"📦 bulk_with_pca_embeddings_tissue_growth: {bulk_2.shape}")

# Column counts of both tables
print(f"📦 bulk_with_pca_tissue_growth columns: {len(bulk_1.columns)}")
print(f"📦 bulk_with_pca_embeddings_tissue_growth columns: {len(bulk_2.columns)}")

# Column-name sets, and the names exclusive to each side
cols_1, cols_2 = set(bulk_1.columns), set(bulk_2.columns)
only_in_1 = cols_1.difference(cols_2)
only_in_2 = cols_2.difference(cols_1)

# Each report is a header line followed by the sorted exclusive column names
print("\n🟢 Columns only in bulk_with_pca_tissue_growth:", sorted(only_in_1), sep="\n")
print("\n🔵 Columns only in bulk_with_pca_embeddings_tissue_growth:", sorted(only_in_2), sep="\n")



📦 bulk_with_pca_tissue_growth: (571985, 33)
📦 bulk_with_pca_embeddings_tissue_growth: (571985, 33)
📦 bulk_with_pca_tissue_growth columns: 33
📦 bulk_with_pca_embeddings_tissue_growth columns: 33

🟢 Columns only in bulk_with_pca_tissue_growth:
[]

🔵 Columns only in bulk_with_pca_embeddings_tissue_growth:
[]


In [7]:
# Boolean row masks: True where the row's model ID is one of the single-cell IDs
in_single_cell_1 = bulk_1["SANGER_MODEL_ID"].isin(single_cell_ids)
in_single_cell_2 = bulk_2["SANGER_MODEL_ID"].isin(single_cell_ids)

# Keep only the matching rows; .copy() makes the subsets independent frames
bulk_1_aligned = bulk_1.loc[in_single_cell_1].copy()
bulk_2_aligned = bulk_2.loc[in_single_cell_2].copy()

print(f"✅ Aligned bulk_1 shape: {bulk_1_aligned.shape}")
print(f"✅ Aligned bulk_2 shape: {bulk_2_aligned.shape}")


✅ Aligned bulk_1 shape: (83624, 33)
✅ Aligned bulk_2 shape: (83624, 33)


In [8]:
# Write each aligned table to its parquet file, without the pandas index
for aligned_frame, out_path in (
    (bulk_1_aligned, "../../data/processed/bulk_30_pcs.parquet"),
    (bulk_2_aligned, "../../data/processed/bulk_embeddings_30_pcs.parquet"),
):
    aligned_frame.to_parquet(out_path, index=False)

print("💾 Saved aligned datasets for single-cell training.")


💾 Saved aligned datasets for single-cell training.
