In [4]:
import pandas as pd

# Input locations. Both are relative paths, so they resolve against the
# notebook's current working directory.
TISSUE_PATH = "feature_engineering/processed_tissue_dummies.csv"
EMBEDDINGS_PATH = "../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs.parquet"  # alternative: the raw_embeddings file, if needed

# Read the embeddings table (parquet) and the tissue dummy table (CSV).
df_embed = pd.read_parquet(EMBEDDINGS_PATH)
tissue_df = pd.read_csv(TISSUE_PATH)

# Report (rows, columns) for each input, one per line.
print(
    f"🧬 Embeddings dataset shape: {df_embed.shape}",
    f"🏷️ Tissue features shape: {tissue_df.shape}",
    sep="\n",
)


🧬 Embeddings dataset shape: (571985, 63)
🏷️ Tissue features shape: (1431, 16)


In [5]:
# Attach the tissue indicator columns to every embedding row, keyed on
# SANGER_MODEL_ID. The left join keeps every row of df_embed.
# validate="many_to_one" makes pandas raise MergeError if SANGER_MODEL_ID is
# not unique in tissue_df; without it, duplicated keys would silently
# multiply embedding rows in the merged output.
merged_df = pd.merge(
    df_embed,
    tissue_df,
    on="SANGER_MODEL_ID",
    how="left",
    validate="many_to_one",
)

# Embedding rows with no match in tissue_df come out of the left join with
# NaN in every tissue column; count those NaN cells across all "Tissue_" columns.
tissue_cols = [col for col in merged_df.columns if col.startswith("Tissue_")]
missing_tissues = merged_df[tissue_cols].isna().sum().sum()
print(f"⚠️ Missing tissue dummy entries: {missing_tissues}")

# Treat unmatched rows as "no tissue flag set": fill NaNs (if any) with 0 and
# cast the tissue columns to int.
merged_df[tissue_cols] = merged_df[tissue_cols].fillna(0).astype(int)

print("✅ Tissue features successfully merged and cleaned.")


⚠️ Missing tissue dummy entries: 0
✅ Tissue features successfully merged and cleaned.


In [7]:
# Destination for the merged embeddings + tissue table (relative path).
OUTPUT_PATH = "../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs_tissue.parquet"

# Write the merged frame to parquet; index=False leaves the DataFrame index
# out of the file.
merged_df.to_parquet(
    OUTPUT_PATH,
    index=False,
)
print(f"📁 Saved dataset with tissue features to:\n{OUTPUT_PATH}")


📁 Saved dataset with tissue features to:
../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs_tissue.parquet
