In [5]:
import pandas as pd

# Input locations. EMBEDDINGS_PATH is read as a parquet file and TISSUE_PATH
# as a CSV; both are relative paths, so they resolve against the kernel's
# working directory.
EMBEDDINGS_PATH = "../../data/processed/bulk_embeddings_30_pcs.parquet"  # or raw_embeddings if needed
TISSUE_PATH = "feature_engineering/processed_tissue_dummies.csv"

# Embeddings table: read it and report its (rows, columns) shape.
df_embed = pd.read_parquet(EMBEDDINGS_PATH)
print(f"🧬 Embeddings dataset shape: {df_embed.shape}")

# Tissue feature table: read it and report its (rows, columns) shape.
tissue_df = pd.read_csv(TISSUE_PATH)
print(f"🏷️ Tissue features shape: {tissue_df.shape}")


🧬 Embeddings dataset shape: (83624, 33)
🏷️ Tissue features shape: (140, 12)


In [6]:
# Attach the tissue columns to every embedding row, keyed on SANGER_MODEL_ID.
# how="left" keeps every row of df_embed. validate="many_to_one" makes pandas
# raise MergeError if SANGER_MODEL_ID is repeated in tissue_df; without it a
# duplicated key would silently multiply the matching embedding rows.
merged_df = pd.merge(
    df_embed,
    tissue_df,
    on="SANGER_MODEL_ID",
    how="left",
    validate="many_to_one",
)

# Invariant of a left join against unique right-hand keys: row count unchanged.
assert len(merged_df) == len(df_embed), "merge changed the number of rows"

# Tissue columns are identified purely by their "Tissue_" name prefix.
tissue_cols = [col for col in merged_df.columns if col.startswith("Tissue_")]

# Total number of NaN cells across all tissue columns (a cell count, not a row
# count). Rows of df_embed with no match in tissue_df are NaN in every one of
# these columns.
missing_tissues = merged_df[tissue_cols].isna().sum().sum()
print(f"⚠️ Missing tissue dummy entries: {missing_tissues}")

# Replace NaNs (if any) with 0 and cast to int. Unmatched rows therefore end
# up with 0 in every tissue column.
merged_df[tissue_cols] = merged_df[tissue_cols].fillna(0).astype(int)

print("✅ Tissue features successfully merged and cleaned.")

print(merged_df.head())


⚠️ Missing tissue dummy entries: 0
✅ Tissue features successfully merged and cleaned.
  SANGER_MODEL_ID  DRUG_ID   LN_IC50   SCF_PC1   SCF_PC2   SCF_PC3   SCF_PC4  \
0       SIDM01111        1  2.033564 -2.761284  5.308661 -1.074850 -0.946203   
1       SIDM00965        1  3.047965 -0.906100  3.386841 -0.006393 -0.650798   
2       SIDM00921        1  3.009075 -1.594771 -1.635437  1.053011 -0.952706   
3       SIDM00395        1  3.381088 -0.956791  2.133660 -1.688075 -0.554651   
4       SIDM00369        1 -1.395483 -0.319099 -2.202758 -1.629978 -0.385201   

    SCF_PC5   SCF_PC6   SCF_PC7  ...  Tissue_Central Nervous System  \
0 -0.200081 -1.026378  1.316409  ...                              0   
1  0.283100  0.112945  0.884774  ...                              0   
2  0.886739 -0.113859  0.029168  ...                              0   
3 -1.282332 -0.501545  0.873872  ...                              0   
4  1.636648  0.748600 -0.135878  ...                              0   

   Tis

In [7]:
# Destination parquet file for the merged embeddings + tissue table.
OUTPUT_PATH = "../../data/processed/bulk_embeddings_30_pcs_tissue.parquet"

# index=False leaves the DataFrame index out of the written file.
merged_df.to_parquet(path=OUTPUT_PATH, index=False)

print(f"📁 Saved dataset with tissue features to:\n{OUTPUT_PATH}")


📁 Saved dataset with tissue features to:
../../data/processed/bulk_embeddings_30_pcs_tissue.parquet
