In [5]:
import pandas as pd

# Input locations, relative to the notebook's working directory.
MAIN_DATASET_PATH = "../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs_tissue.parquet"
GROWTH_PATH = "feature_engineering/processed_growth_day4_ratio.csv"

# Read the main training table (parquet) first, then the growth table (CSV).
df_main, growth_df = (
    pd.read_parquet(MAIN_DATASET_PATH),
    pd.read_csv(GROWTH_PATH),
)

# Report the (rows, columns) shape of each loaded table.
print(f"📦 Main training dataset: {df_main.shape}")
print(f"📈 Growth ratio dataset: {growth_df.shape}")


📦 Main training dataset: (571985, 78)
📈 Growth ratio dataset: (958, 2)


In [6]:
# Retain only the rows of the main table whose SANGER_MODEL_ID also occurs
# in the growth table; all other rows are dropped.
filtered_df = df_main.loc[
    df_main["SANGER_MODEL_ID"].isin(growth_df["SANGER_MODEL_ID"])
]

print(f"📉 Filtered dataset shape (with growth info only): {filtered_df.shape}")


📉 Filtered dataset shape (with growth info only): (566992, 78)


In [7]:
# Merge growth info onto the filtered rows, keyed by SANGER_MODEL_ID.
# validate="many_to_one" makes pandas raise MergeError if SANGER_MODEL_ID is
# duplicated in growth_df; without it, a duplicated key would silently
# multiply the matching rows of filtered_df in the merged output.
merged_df = pd.merge(
    filtered_df,
    growth_df,
    on="SANGER_MODEL_ID",
    how="left",
    validate="many_to_one",
)

# Confirm no missing growth values remain
missing = merged_df["day4_day1_ratio"].isna().sum()
assert missing == 0, f"There are still {missing} missing growth values after filtering."
print("✅ All cell lines have growth values after filtering.")


✅ All cell lines have growth values after filtering.


In [8]:
# Destination file for the merged table.
OUTPUT_PATH = "../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs_tissue_growth.parquet"

# Write the merged frame as parquet without its index, then report the path.
merged_df.to_parquet(
    OUTPUT_PATH,
    index=False,
)
print(f"✅ Final enriched training dataset saved to:\n{OUTPUT_PATH}")


✅ Final enriched training dataset saved to:
../../data/processed/bulk_voom_30_pcs_embeddings_30_pcs_tissue_growth.parquet
