In [1]:
import pandas as pd

# Input locations: the embedding table determines which models are in scope,
# the growth table supplies the measurements to be filtered.
EMBEDDINGS_CSV = "../../data/processed/pancancer_embeddings_30_pcs.csv"
GROWTH_CSV = "../../data/original/growth_rate.csv"

# Distinct model IDs that appear in the embedding table
bulk_all = pd.read_csv(EMBEDDINGS_CSV)
used_models = bulk_all["SANGER_MODEL_ID"].unique()

# Raw growth measurements, kept under their own name so the unfiltered
# table remains available for inspection
growth_raw = pd.read_csv(GROWTH_CSV)

# Keep only rows whose model is in scope, then coerce the ratio column to
# numeric; values that cannot be parsed become NaN (errors="coerce")
in_scope_rows = growth_raw["model_id"].isin(used_models)
growth_df = growth_raw.loc[in_scope_rows].assign(
    day4_day1_ratio=lambda frame: pd.to_numeric(
        frame["day4_day1_ratio"], errors="coerce"
    )
)

print(f"✅ Filtered growth dataset shape: {growth_df.shape}")


✅ Filtered growth dataset shape: (148, 7)


In [2]:
# Reduce the filtered growth table to a single day4/day1 ratio per model,
# preferring rows with a higher `replicates` value.
#
# - The sort is explicitly stable (kind="mergesort"): rows that tie on
#   `replicates` keep their original relative order, so the row chosen for a
#   model with tied rows is well defined instead of depending on the
#   internals of the default (unstable) sort algorithm.
# - GroupBy.first() returns the first *non-null* value in each group, so a
#   model whose highest-replicate row has a missing ratio falls back to the
#   next row that has one; a model with no valid ratio at all is kept with
#   NaN rather than dropped.
# - The key column is renamed to SANGER_MODEL_ID, the column name used by the
#   embedding table loaded earlier in this notebook.
growth_cleaned = (
    growth_df.sort_values("replicates", ascending=False, kind="mergesort")
    .groupby("model_id")["day4_day1_ratio"]
    .first()
    .reset_index()
    .rename(columns={"model_id": "SANGER_MODEL_ID"})
)

print(f"✅ Cleaned growth rate shape: {growth_cleaned.shape}")
growth_cleaned.head()


✅ Cleaned growth rate shape: (138, 2)


Unnamed: 0,SANGER_MODEL_ID,day4_day1_ratio
0,SIDM00078,3.46
1,SIDM00080,1.99
2,SIDM00082,0.98
3,SIDM00088,2.18
4,SIDM00092,1.83


In [3]:
import os

# Destination for the per-model growth feature table
output_path = "feature_engineering/processed_growth_day4_ratio.csv"

# Create the parent directory of the destination if it is not there yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write without the positional index so the file holds only the two
# feature columns
growth_cleaned.to_csv(output_path, index=False)

print(f"📁 Saved growth rate features to: {output_path}")


📁 Saved growth rate features to: feature_engineering/processed_growth_day4_ratio.csv
