In [1]:
import pandas as pd

# Embedding table (parquet). Swap EMBEDDINGS_PATH for the raw_embeddings
# file if that variant is the one needed.
EMBEDDINGS_PATH = "../../data/processed/gdsc_pancancer_embeddings_expression_30_pcs.parquet"
df_embed = pd.read_parquet(EMBEDDINGS_PATH)

# Tissue dummy table (CSV).
TISSUE_PATH = "feature_engineering/processed_tissue_dummies.csv"
tissue_df = pd.read_csv(TISSUE_PATH)


In [2]:
# Attach the tissue dummy columns to every embedding row, keyed on SANGER_MODEL_ID.
# how="left" keeps every row of df_embed. validate="many_to_one" makes pandas raise
# a MergeError if tissue_df holds more than one row for the same SANGER_MODEL_ID;
# without it, duplicate keys would silently multiply embedding rows in the output.
merged_df = pd.merge(
    df_embed,
    tissue_df,
    on="SANGER_MODEL_ID",
    how="left",
    validate="many_to_one",
)

# Tissue dummy columns are identified by their "Tissue_" name prefix.
tissue_cols = [col for col in merged_df.columns if col.startswith("Tissue_")]

# Total number of NaN cells across all tissue dummy columns. Rows of df_embed
# with no match in tissue_df come out of the left merge as NaN in these columns.
missing_tissues = merged_df[tissue_cols].isna().sum().sum()
print(f"⚠️ Missing tissue dummy entries: {missing_tissues}")

# Replace any NaN with 0 and store the dummies as integers.
merged_df[tissue_cols] = merged_df[tissue_cols].fillna(0).astype(int)

print("✅ Tissue features successfully merged and cleaned.")

print(merged_df.head())


⚠️ Missing tissue dummy entries: 0
✅ Tissue features successfully merged and cleaned.
  SANGER_MODEL_ID  DRUG_ID   LN_IC50         0         1         2          3  \
0       SIDM01111        1  2.033564 -2.465400  1.202431 -0.944438  -9.463418   
1       SIDM00965        1  3.047965 -3.944899  0.260618 -0.084734  -5.278831   
2       SIDM00921        1  3.009075 -2.428348  3.057883 -6.248041   0.675703   
3       SIDM00395        1  3.381088 -2.083047  9.835143  3.641217 -11.338659   
4       SIDM00369        1 -1.395483  7.560895  4.969285  0.769833   4.234478   

           4         5         6  ...  Tissue_Central Nervous System  \
0   7.780813 -1.961875 -1.021320  ...                              0   
1  11.148554 -5.640492  2.199688  ...                              0   
2   0.930572 -1.085604  2.428111  ...                              0   
3   4.221368 -9.624701 -8.043923  ...                              0   
4  -0.730354 -3.121336  1.328352  ...                              

In [3]:
# Destination for the merged table (embeddings plus tissue dummy columns).
OUTPUT_PATH = "../../data/processed/gdsc_pancancer_embeddings_expression_30_pcs_tissue.parquet"

# Write the table as parquet; index=False keeps the DataFrame index out of the file.
merged_df.to_parquet(
    OUTPUT_PATH,
    index=False,
)

# Report where the file was written.
print(
    f"📁 Saved dataset with tissue features to:\n{OUTPUT_PATH}"
)


📁 Saved dataset with tissue features to:
../../data/processed/gdsc_pancancer_embeddings_expression_30_pcs_tissue.parquet
