In [40]:
import pandas as pd

# Read both GDSC dose-response files and tag every row with the file it came
# from, so the provenance survives the concatenation below.
gdsc1 = pd.read_parquet(
    "../../data/original/gdsc1_fitted_dose_response.parquet"
).assign(SOURCE="GDSC1")
gdsc2 = pd.read_parquet(
    "../../data/original/gdsc2_fitted_dose_response.parquet"
).assign(SOURCE="GDSC2")

# Stack the two tables row-wise and renumber the index from 0.
gdsc_all = pd.concat((gdsc1, gdsc2), axis=0, ignore_index=True)
print("‚úÖ Loaded:", gdsc_all.shape)


‚úÖ Loaded: (575197, 20)


In [41]:
# For every (drug, cell line) pair, count how many distinct sources report it.
pair_sources = (
    gdsc_all
    .groupby(["DRUG_ID", "SANGER_MODEL_ID"])["SOURCE"]
    .nunique()
    .reset_index()
)

# A pair seen in more than one source is present in both datasets.
dup_pairs = pair_sources.loc[pair_sources["SOURCE"].gt(1)]

# Distinct drugs that take part in at least one such overlapping pair.
dup_drugs = dup_pairs["DRUG_ID"].unique()
print("üîÅ Number of drugs with overlapping cell lines:", len(dup_drugs))


üîÅ Number of drugs with overlapping cell lines: 76


In [42]:
# Replacement IDs are handed out consecutively, beginning right after the
# largest DRUG_ID currently in the table: one new ID per drug in dup_drugs,
# in the order the drugs appear there.
max_drug_id = gdsc_all["DRUG_ID"].max()
drug_id_map = {
    drug: max_drug_id + offset
    for offset, drug in enumerate(dup_drugs, start=1)
}

# First ID that is still unused once every drug above has received one.
next_id = max_drug_id + 1 + len(dup_drugs)


In [43]:
# Preserve the original IDs, but only the first time this cell runs.
# Without the guard, re-running the cell after DRUG_ID has been rewritten would
# copy the already-split IDs into DRUG_ID_ORIGINAL and lose the true originals.
if "DRUG_ID_ORIGINAL" not in gdsc_all.columns:
    gdsc_all["DRUG_ID_ORIGINAL"] = gdsc_all["DRUG_ID"]

def split_drug_id(row):
    """Return the DRUG_ID a single row should carry after the split.

    A row whose ``SOURCE`` is ``"GDSC2"`` and whose ``DRUG_ID_ORIGINAL`` is a
    key of the module-level ``drug_id_map`` gets the mapped replacement ID.
    Every other row keeps its ``DRUG_ID_ORIGINAL`` unchanged.

    ``row`` only needs to support ``row["SOURCE"]`` and
    ``row["DRUG_ID_ORIGINAL"]`` lookups.
    """
    # Only GDSC2 rows are ever remapped.
    if row["SOURCE"] != "GDSC2":
        return row["DRUG_ID_ORIGINAL"]

    original_id = row["DRUG_ID_ORIGINAL"]
    # Fall back to the original ID when the drug has no replacement.
    return drug_id_map.get(original_id, original_id)

# Vectorised form of gdsc_all.apply(split_drug_id, axis=1): the row-wise apply
# builds a Python-level Series for every row, which is needlessly slow on a
# table of this size. The rule is the same: a row gets a new ID only when it
# comes from GDSC2 AND its original ID is a key of drug_id_map.
needs_new_id = (
    gdsc_all["SOURCE"].eq("GDSC2")
    & gdsc_all["DRUG_ID_ORIGINAL"].isin(list(drug_id_map))
)

# Start from the original IDs, then overwrite just the selected rows.
# Mapping only the selected rows means every lookup hits a key, so no NaN is
# produced and the column is not silently upcast to float.
gdsc_all["DRUG_ID"] = gdsc_all["DRUG_ID_ORIGINAL"]
gdsc_all.loc[needs_new_id, "DRUG_ID"] = (
    gdsc_all.loc[needs_new_id, "DRUG_ID_ORIGINAL"].map(drug_id_map)
)


In [44]:
# Report the effect of the split: distinct IDs before and after, and how many
# drugs received a replacement ID.
summary = {
    "üß™ Original unique DRUG_IDs:": gdsc_all["DRUG_ID_ORIGINAL"].nunique(),
    "üÜï Final unique DRUG_IDs after split:": gdsc_all["DRUG_ID"].nunique(),
    "‚ûï Number of reassigned IDs:": len(drug_id_map),
}
for label, value in summary.items():
    print(label, value)

# Write the relabelled table to disk (the row index is not stored).
gdsc_all.to_parquet(
    "../../data/processed/gdsc_with_split_drug_ids.parquet",
    index=False,
)
print("üìÅ Saved cleaned data to: ../../data/processed/gdsc_with_split_drug_ids.parquet")


üß™ Original unique DRUG_IDs: 621
üÜï Final unique DRUG_IDs after split: 697
‚ûï Number of reassigned IDs: 76
üìÅ Saved cleaned data to: ../../data/processed/gdsc_with_split_drug_ids.parquet
