We have the following problem: drug names are not unique across the respective dataframes. The same drug may appear under different synonyms, with typos, or with inconsistent capitalization. Our first resort was to retrieve the PubChem IDs. However, this opened up new problems: as it turns out, PubChem IDs are not unique either. This notebook tries to assess how large the problem is.

In [1]:
import os
import pandas as pd
import pubchempy as pcp

In [3]:
# Response file of every dataset, keyed by dataset name.
# NOTE(review): the paths are absolute and machine-specific.
all_datasets = {
    "CCLE": "/Users/judithbernett/PycharmProjects/drp_model_suite/data/CCLE/CCLE.csv",
    "CTRPv1": "/Users/judithbernett/PycharmProjects/drp_model_suite/data/CTRPv1/CTRPv1.csv",
    "CTRPv2": "/Users/judithbernett/PycharmProjects/drp_model_suite/data/CTRPv2/CTRPv2.csv",
    "GDSC1": "/Users/judithbernett/PycharmProjects/drp_model_suite/data/GDSC1/GDSC1.csv",
    "GDSC2": "/Users/judithbernett/PycharmProjects/drp_model_suite/data/GDSC2/GDSC2.csv",
}

# Gather the distinct (pubchem_id, drug_name) pairs of each dataset and tag
# them with the dataset they came from. The per-dataset frames are collected
# in a list and concatenated once, instead of growing a frame that starts out
# empty (which re-copies all previous rows on every iteration).
per_dataset_drugs = []
for dataset, file in all_datasets.items():
    df = pd.read_csv(file)
    df = df[["pubchem_id", "drug_name"]].drop_duplicates()
    df["dataset"] = dataset
    per_dataset_drugs.append(df)
all_drug_names = pd.concat(per_dataset_drugs)
all_drug_names

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


Unnamed: 0,pubchem_id,drug_name,dataset
0,11656518,RAF265,CCLE
1,24180719,PLX-4720,CCLE
2,10127622,Selumetinib,CCLE
3,10302451,Saracatinib,CCLE
4,10461815,PHA-665752,CCLE
...,...,...,...
229715,3899,Leflunomide,GDSC2
231502,864,alpha-lipoic acid,GDSC2
232237,124886,glutathione,GDSC2
232969,12035,N-acetyl cysteine,GDSC2


We will now iterate through all drug names. For each drug name, we will retrieve the PubChem ID, the synonyms, the IUPAC name, the canonical SMILES, the CACTVS fingerprint, and the fingerprint.

In [3]:
# Look up every distinct drug name on PubChem and record the first match.
# - iupac_hashmap: hash(IUPAC name) -> list of drug names that resolved to it
# - no_drug_names: names for which PubChem returned no compound
# - result_df:     one row per successfully resolved drug name
# NOTE(review): drug_names is a set, so the processing order (and therefore
# the order of result_df rows) is not guaranteed to be stable between runs.
drug_names = set(all_drug_names["drug_name"])
iupac_hashmap = {}
no_drug_names = []
result_columns = ["drug_name", "pubchem_id", "iupac_name", "canonical_smiles", "cactvs_fingerprint", "fingerprint"]
# Rows are collected in a list and turned into a DataFrame once after the
# loop; concatenating a one-row frame per drug re-copies the whole table on
# every iteration.
result_rows = []
for idx, drug_name in enumerate(drug_names):
    if idx % 10 == 0:
        print(f"Processing {idx} of {len(drug_names)}")
    try:
        compound = pcp.get_compounds(drug_name, namespace="name")
        if len(compound) == 0:
            raise ValueError(f"No compound found for {drug_name}")
        # if more than one compound is found, we will take the first match
        compound = compound[0]
        # NOTE(review): if compound.iupac_name is None, every such compound
        # shares hash(None) and is reported as a collision - confirm whether
        # that is intended.
        iupac_hash = hash(compound.iupac_name)
        if iupac_hash not in iupac_hashmap:
            iupac_hashmap[iupac_hash] = [drug_name]
        elif drug_name not in iupac_hashmap[iupac_hash]:
            print(f"Collision for {drug_name}: {iupac_hashmap[iupac_hash]}")
            iupac_hashmap[iupac_hash].append(drug_name)
        result_rows.append({
            "drug_name": drug_name,
            "pubchem_id": compound.cid,
            "iupac_name": compound.iupac_name,
            "canonical_smiles": compound.canonical_smiles,
            "cactvs_fingerprint": compound.cactvs_fingerprint,
            "fingerprint": compound.fingerprint,
        })
    except Exception as e:
        # Any failure (no match, request error, ...) is reported and skipped;
        # only the "no match" case is remembered for manual curation.
        print(f"Error for {drug_name}: {e}")
        if "No compound found" in str(e):
            no_drug_names.append(drug_name)
result_df = pd.DataFrame(result_rows, columns=result_columns)

Processing 0 of 1253
Error for 1807: No compound found for 1807
Error for PARP_9495: No compound found for PARP_9495
Processing 10 of 1253
Error for TL-2-105: No compound found for TL-2-105
Processing 20 of 1253
Error for 615590: No compound found for 615590
Error for XMD15-27: No compound found for XMD15-27
Error for N24798-49-A1: No compound found for N24798-49-A1
Processing 30 of 1253
Error for 968: No compound found for 968
Error for Bleomycin (50 uM): No compound found for Bleomycin (50 uM)
Error for carboplatin:UNC0638 (2:1 mol/mol): No compound found for carboplatin:UNC0638 (2:1 mol/mol)
Error for Compound 44: No compound found for Compound 44
Processing 40 of 1253
Error for QW-BI-011: No compound found for QW-BI-011
Error for 2-bromopyruvate: No compound found for 2-bromopyruvate
Error for BRD4372: No compound found for BRD4372
Processing 50 of 1253
Error for BRD5802: No compound found for BRD5802
Error for FY026: No compound found for FY026
Error for BRD4470: No compound found

In [4]:
# Summary of the PubChem lookup: how many names there were, how many had no
# match, and how many names share an IUPAC name with at least one other name.
iupac_hash_collisions = 0
for names in iupac_hashmap.values():
    if len(names) > 1:
        iupac_hash_collisions += len(names)

print("Overall stats: ")
print(f"Number of drug names: {len(drug_names)}")
print(f"Number of drug names with no compounds: {len(no_drug_names)}")
print(f"Number of IUPAC hash collisions: {iupac_hash_collisions}")

Overall stats: 
Number of drug names: 1253
Number of drug names with no compounds: 258
Number of IUPAC hash collisions: 241


In [5]:
# Rebind iupac_hash_collisions from the count to the actual groups: every list
# of drug names (length > 1) that resolved to the same IUPAC-name hash.
iupac_hash_collisions = [v for v in iupac_hashmap.values() if len(v) > 1]

In [6]:
# Export the unmatched names and the collision groups for manual curation.
pd.DataFrame(no_drug_names, columns=["drug_name"]).to_csv("no_drug_names.csv", index=False)
# Collision groups have different lengths. Pad them to a common width (at
# least the three columns the file has always had) so that a group with more
# or fewer names than expected no longer makes the DataFrame constructor fail.
collision_width = max([3, *(len(names) for names in iupac_hash_collisions)])
padded_collisions = [
    list(names) + [None] * (collision_width - len(names))
    for names in iupac_hash_collisions
]
collision_columns = [f"drug_name_{i}" for i in range(1, collision_width + 1)]
pd.DataFrame(padded_collisions, columns=collision_columns).to_csv("iupac_hash_collisions.csv", index=False)

First task: harmonize the iupac collisions. Find out whether they also got different PubChem IDs.

In [7]:
import numpy as np
# Check whether names that share an IUPAC name also share a single PubChem ID.
# All names involved in any collision, flattened directly from the groups so
# that the number of names per group does not matter.
iupac_flat = np.array(
    [name for collision_list in iupac_hash_collisions for name in collision_list],
    dtype=object,
)
result_subset = result_df[result_df["drug_name"].isin(iupac_flat)]
for collision_list in iupac_hash_collisions:
    pubchem_ids = result_subset[result_subset["drug_name"].isin(collision_list)]["pubchem_id"].unique()
    # Only report groups whose members were assigned different PubChem IDs.
    if len(pubchem_ids) > 1:
        print(f"Collision for {collision_list}: {pubchem_ids}")

Great news - they all got the same PubChemIDs. We will harmonize the names in a separate csv.

In [9]:
# Manually curated collision table: the index holds the harmonized name, the
# remaining cells of a row hold the synonyms that should map onto it.
hash_collisions_resolved = pd.read_csv("iupac_hash_collisions_resolved.csv", index_col=0)
# old name -> harmonized name; empty (NaN) cells are skipped.
mapping_dict = {
    synonym: harmonized_name
    for harmonized_name, synonyms in hash_collisions_resolved.iterrows()
    for synonym in synonyms
    if not pd.isna(synonym)
}

Next issue: unmappable drug names. Let's see how many of them we found so far.

In [10]:
# All (pubchem_id, drug_name, dataset) rows whose name had no PubChem match,
# ordered by drug name.
has_no_match = all_drug_names["drug_name"].isin(no_drug_names)
subsetted_drug_names = all_drug_names[has_no_match].sort_values(by="drug_name")

In [11]:
# Export the unmatched names together with their datasets for manual curation.
subsetted_drug_names.to_csv("no_drug_names_results.csv", index=False)

We manually change some of these mappings and adapt the mapping dictionary accordingly.

In [12]:
# Manually curated renames for drug names that PubChem could not resolve:
# original dataset name -> replacement name. Merged into mapping_dict below.
new_mappings = {
    "(-)-gallocatechin-3-monogallate": "(-)-Gallocatechin gallate",
    "16-beta-bromoandrosterone": "16beta-Bromoandrosterone",
    "1S,3R-RSL-3": "(1S,3R)-Rsl3",
    "2,4-dideoxy-DC-45-A2": "BRD-K41087962-001-01-7",
    "2-bromopyruvate": "3-Bromopyruvic acid",
    "2-deoxyglucose": "2-Deoxy-D-arabino-hexopyranose",
    "4-methylfasudil": "5-(1,4-Diazepan-1-ylsulfonyl)-4-methylisoquinoline",
    "5-benzyl-9-tert-butyl-paullone": "CHEMBL575106",
    "968": "Glutaminase C-IN-1",
    "AA-COCF3": "Aacocf3",
    "BRD-A86708339": "TCMDC-125552",
    "BRD-K03536150": "SCHEMBL16273428",
    "BRD-K07442505": "BAM7",
    "BRD-K20514654": "BRD-K20514654-001-01-8",
    "BRD-K29313308": "BRD3308",
    "BRD-K35604418": "mim1",
    "BRD-K63431240": "BRD1240",
    "BRD-K64610608": "MLS003179190",
    "BRD-K70511574": "HMS3654N14",
    "BRD-K79669418": "CHEMBL5275075",
    "BRD-K88742110": "PCI-34051",
    "BRD0713": "(2R,3R,4S)-4-(hydroxymethyl)-3-phenyl-1-propylazetidine-2-carbonitrile",
    "BRD1812": "ICG-001",
    "BRD2572": "1-[(1R,5S)-7-phenyl-6-propyl-3,6-diazabicyclo[3.1.1]heptan-3-yl]ethanone",
    "BRD4046": "[(2S,3R,4R)-4-(aminomethyl)-3-phenyl-1-propylazetidin-2-yl]methanol",
    "BRD4372": "1-[(1S,2aS,8bS)-1-(hydroxymethyl)-2-propyl-1,2a,3,8b-tetrahydroazeto[2,3-c]quinolin-4-yl]ethanone",
    "BRD4470": "BRD-4470",
    "BRD55319": "BRD-K53855319-001-01-2",
    "BRD5586": "5-phenyl-1,7-dihydrotetrazolo[1,5-a]pyrimidine",
    "BRD63610": "BRD-K85563610-001-01-0",
    "BRD6368": "BRD-K14696368-001-01-8",
    "BRD6430": "Sonidegib",
    "BRD6825": "BRD-96825",
    "BRD7137": "5H-quinolino[8,7-c][1,2]benzothiazine 6,6-dioxide",
    "BRD8097": "(1S,5R)-3-methylsulfonyl-7-phenyl-6-propyl-3,6-diazabicyclo[3.1.1]heptane",
    "BRD8418": "(1R,5S)-7-phenyl-6-propyl-3,6-diazabicyclo[3.1.1]heptane",
    "BRD8958": "C646",
    "Brivanib, BMS-540215": "Brivanib:BMS-540215",
    "CAP-232, TT-232, TLN-232": "CAP-232:TT-232:TLN-232",
    "CID 2853753": "2-(1,2-Dihydroimidazo[1,2-a]benzimidazol-4-yl)-1-(4-phenylphenyl)ethanone",
    "Cetuximab": "C225",
    "Compound 110": "BRD-K03618428-001-01-3",
    "Compound 11e": "VX-11e",
    "Compound 12": "BRD-K40892394-001-01-9",
    "Compound 2": "BCATc Inhibitor 2",
    "Compound 4": "PTP1B-IN-3",
    "Compound 44": "CHEMBL5270701",
    "DC-45-A2": "BRD-K79983625-001-01-1",
    "ISOX": "Bml-281",
    "KRAS (G12C) Inhibitor-12": "K-Ras(G12C) inhibitor 12",
    "ML334 diastereomer": "BRD-K93367411-001-03-3",
    "MPS-1-IN-1": "Mps1-IN-1",
    "MetAP2 Inhibitor, A832234": "MetAP2 Inhibitor:A832234",
    "Nutlin-3a (-)": "Rebemadlin",
    "P-0850": "BRAF inhibitor",
    "PP-30": "BRD-K30677119-001-01-0",
    "Picolinici-acid": "Picolinic acid",
    "QW-BI-011": "BRD-4770",
    "SR-II-138A": "Rohinitib",
    "T-5345967": "ML-031",
    "TTNBP": "Arotinoid acid",
    "VAF-347": "Vaf347",
    "Venotoclax": "Venetoclax",
    "YL54": "BRD-K58306044-001-01-3",
    "ascorbate (vitamin C)": "Ascorbic acid",
    "ceranib-2": "CHEMBL4788167",
    "compound 1B": "Dnmdp",
    "eEF2K Inhibitor, A-484954": "a-484954",
    "erastin-A8": "SCHEMBL4462685",
    "m-3M3-FBS": "Phospholipase",
    "racemic-2,4-dideoxy-DC-45-A2": "SCHEMBL18710180",
    "tipifarnib-P1": "Tipifarnib S enantiomer",
    "tipifarnib-P2": "Tipifarnib",
}

In [13]:
# Add the manual renames to the mapping; on duplicate keys the manual entry wins.
mapping_dict.update(new_mappings)

In [14]:
# Manually resolved version of the unmatched names; the first CSV column
# becomes the index and is used as the PubChem CID in the lookup below.
no_drug_names_resolved = pd.read_csv("no_drug_names_resolved.csv", index_col=0)

Let's concatenate the resolved drug names to the result dataframe.

In [15]:
# Query PubChem by CID (the index of no_drug_names_resolved) for the manually
# resolved drugs and append the hits to a copy of result_df.
# new_collisions collects, per IUPAC hash, the names that now collide with a
# previously resolved drug.
new_df = result_df.copy()
new_collisions = {}
resolved_rows = []
for idx, row in no_drug_names_resolved.iterrows():
    try:
        compound = pcp.get_compounds(idx, namespace="cid")
        if len(compound) == 0:
            raise ValueError(f"No compound found for {row['drug_name']}")
        compound = compound[0]
        iupac_hash = hash(compound.iupac_name)
        # The collision check uses the harmonized name ...
        new_name = mapping_dict.get(row['drug_name'], row['drug_name'])
        if (iupac_hash in iupac_hashmap) and (new_name not in iupac_hashmap[iupac_hash]):
            print(f"Collision for {new_name}: {iupac_hashmap[iupac_hash]}")
            # ... while the original name is what gets recorded.
            # NOTE(review): confirm that mixing the two is intended.
            # list.append returns None, so append first and then store the
            # list itself (previously every new_collisions value was None).
            iupac_hashmap[iupac_hash].append(row['drug_name'])
            new_collisions[iupac_hash] = iupac_hashmap[iupac_hash]
        resolved_rows.append({
            "drug_name": row['drug_name'],
            "pubchem_id": compound.cid,
            "iupac_name": compound.iupac_name,
            "canonical_smiles": compound.canonical_smiles,
            "cactvs_fingerprint": compound.cactvs_fingerprint,
            "fingerprint": compound.fingerprint,
        })
    except Exception as e:
        # Entries whose index is not a valid CID are expected to fail here;
        # they are reported and skipped.
        print(f"{row['drug_name']}: {e}")
# Append all hits at once instead of re-copying new_df for every single row.
if resolved_rows:
    new_df = pd.concat([new_df, pd.DataFrame(resolved_rows, columns=result_df.columns)])
        

1205: 'PUGREST.BadRequest'
123138: 'PUGREST.BadRequest'
123829: 'PUGREST.BadRequest'
150412: 'PUGREST.BadRequest'
1807: 'PUGREST.BadRequest'
1818: 'PUGREST.BadRequest'
50869: 'PUGREST.BadRequest'
615590: 'PUGREST.BadRequest'
630600: 'PUGREST.BadRequest'
667880: 'PUGREST.BadRequest'
720427: 'PUGREST.BadRequest'
729189: 'PUGREST.BadRequest'
741909: 'PUGREST.BadRequest'
743380: 'PUGREST.BadRequest'
765771: 'PUGREST.BadRequest'
776928: 'PUGREST.BadRequest'
965-D2: 'PUGREST.BadRequest'
993-D2: 'PUGREST.BadRequest'
AZD7969: 'PUGREST.BadRequest'
BAY ACCi: 'PUGREST.BadRequest'
BAY AKT1: 'PUGREST.BadRequest'
BAY-HDAC11_1: 'PUGREST.BadRequest'
BAY-HDAC11_2: 'PUGREST.BadRequest'
BAY-HDAC11_4: 'PUGREST.BadRequest'
BAY-MPS-combo 2 (paclitaxel 1 uM): 'PUGREST.BadRequest'
BAY-MPS-combo-1 (paclitaxel 5 uM): 'PUGREST.BadRequest'
BAY-MPS1: 'PUGREST.BadRequest'
BDF00022089a: 'PUGREST.BadRequest'
BDILV000379a: 'PUGREST.BadRequest'
BDOCA000347a: 'PUGREST.BadRequest'
BDP-00009066: 'PUGREST.BadRequest'
BPD-0

In [16]:
# change the mapping dict for collisions: these names are mapped onto names
# that already appear as targets in the manual mappings
mapping_dict.update({
    "erismodegib": "Sonidegib",
    "CAY10603": "Bml-281",
    "BRD4770": "BRD-4770",
    "ML031": "ML-031",
})

In [17]:
# Resolved entries whose ID (the index) did not make it into new_df, i.e. the
# drugs that are still without PubChem data.
# The known IDs are converted to strings once and kept in a set; previously
# the whole column was converted and scanned again for every index entry.
known_pubchem_ids = set(new_df["pubchem_id"].values.astype(str).tolist())
missing_ids = [drug_id for drug_id in no_drug_names_resolved.index if drug_id not in known_pubchem_ids]
no_drug_names_rest = no_drug_names_resolved[no_drug_names_resolved.index.isin(missing_ids)]

For CTRPv1 and CTRPv2, we have SMILES strings even where we don't have PubChem IDs. We merge these SMILES into the remaining unmapped drugs.

In [18]:
# Compound name / SMILES pairs shipped with the two CTRP releases.
ctrp_smiles_columns = ["cpd_name", "cpd_smiles"]
ctrpv1_compounds = pd.read_csv("../CTRP/response/CTRPv1.0_2013_pub_Cell_154_1151/v10.M1.informer_set.txt", sep="\t")
smiles_ctrpv1 = ctrpv1_compounds[ctrp_smiles_columns].drop_duplicates()
ctrpv2_compounds = pd.read_csv("../CTRP/response/CTRPv2.0_2015_ctd2_ExpandedDataset/v20.meta.per_compound.txt", sep="\t")
smiles_ctrpv2 = ctrpv2_compounds[ctrp_smiles_columns].drop_duplicates()
# Stack both releases and drop pairs that occur in both.
all_smiles = pd.concat([smiles_ctrpv1, smiles_ctrpv2]).drop_duplicates()

In [19]:
# Keep only the CTRP compounds that are among the still-unresolved drugs.
all_smiles = all_smiles[all_smiles["cpd_name"].isin(no_drug_names_rest["drug_name"])]

In [20]:
# Rename cpd_name / cpd_smiles to the column names used in new_df.
all_smiles.columns = ["drug_name", "canonical_smiles"]

In [21]:
# Turn the index (read from the first CSV column) back into a regular column.
no_drug_names_rest = no_drug_names_rest.reset_index()

In [22]:
# Left merge: every unresolved drug is kept; canonical_smiles stays NaN where
# CTRP provides no SMILES for the name.
no_drug_names_rest = no_drug_names_rest.merge(all_smiles, on="drug_name", how="left")

In [23]:
# The unresolved drugs have no PubChem record, so the PubChem-derived columns
# are added as all-NaN placeholders.
for placeholder_column in ("fingerprint", "iupac_name", "cactvs_fingerprint"):
    no_drug_names_rest[placeholder_column] = np.nan

In [24]:
# Append the unresolved drugs; columns that exist in only one of the two
# frames (e.g. "dataset") are filled with NaN for the other frame's rows.
new_df = pd.concat([new_df, no_drug_names_rest])

In [25]:
# some last minute mappings
# Each assignment overwrites every column of the matching rows with a
# hand-curated record, in this order:
# drug_name, pubchem_id, iupac_name, canonical_smiles, cactvs_fingerprint, fingerprint, dataset
# NOTE(review): this relies on new_df having exactly these seven columns in this order.
bleomycin_iupac = "3-[[2-[2-[2-[[(2S,3R)-2-[[(2S,3S,4R)-4-[[(2S,3R)-2-[[6-amino-2-[(1S)-3-amino-1-[[(2S)-2,3-diamino-3-oxopropyl]amino]-3-oxopropyl]-5-methylpyrimidine-4-carbonyl]amino]-3-[3-[4-carbamoyloxy-3,5-dihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy-4,5-dihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy-3-(1H-imidazol-5-yl)propanoyl]amino]-3-hydroxy-2-methylpentanoyl]amino]-3-hydroxybutanoyl]amino]ethyl]-1,3-thiazol-4-yl]-1,3-thiazole-4-carbonyl]amino]propyl-dimethylsulfanium"
bleomycin_smiles = "CC1=C(N=C(N=C1N)[C@H](CC(=O)N)NC[C@@H](C(=O)N)N)C(=O)N[C@@H]([C@H](C2=CN=CN2)OC3C(C(C(C(O3)CO)O)O)OC4C(C(C(C(O4)CO)O)OC(=O)N)O)C(=O)N[C@H](C)[C@H]([C@H](C)C(=O)N[C@@H]([C@@H](C)O)C(=O)NCCC5=NC(=CS5)C6=NC(=CS6)C(=O)NCCC[S+](C)C)O"

is_jnk_9l = new_df["drug_name"] == "JNK-9L"
new_df[is_jnk_9l] = ["JNK-9L", 25222038, "4-(3-fluoro-5-morpholin-4-ylphenyl)-N-[4-(3-morpholin-4-yl-1,2,4-triazol-1-yl)phenyl]pyrimidin-2-amine", "C1COCCN1C2=CC(=CC(=C2)C3=NC(=NC=C3)NC4=CC=C(C=C4)N5C=NC(=N5)N6CCOCC6)F", np.nan, np.nan, "GDSC1"]

# The Bleomycin entries keep the drug name as their ID.
is_bleomycin_10 = new_df["drug_name"] == "Bleomycin (10 uM)"
new_df[is_bleomycin_10] = ["Bleomycin (10 uM)", "Bleomycin (10 uM)", bleomycin_iupac, bleomycin_smiles, np.nan, np.nan, "GDSC1"]

is_bleomycin_50_gdsc1 = (new_df["drug_name"] == "Bleomycin (50 uM)") & (new_df["dataset"] == "GDSC1")
new_df[is_bleomycin_50_gdsc1] = ["Bleomycin (50 uM)", "Bleomycin (50 uM)", bleomycin_iupac, bleomycin_smiles, np.nan, np.nan, "GDSC1"]

is_bleomycin_50_gdsc2 = (new_df["drug_name"] == "Bleomycin (50 uM)") & (new_df["dataset"] == "GDSC2")
new_df[is_bleomycin_50_gdsc2] = ["Bleomycin (50 uM)", "Bleomycin (50 uM)", bleomycin_iupac, bleomycin_smiles, np.nan, np.nan, "GDSC2"]

is_kin001_260 = new_df["drug_name"] == "KIN001-260"
new_df[is_kin001_260] = ["KIN001-260", "157400995", "potassium;azide;iodide", "[N-]=[N+]=[N-].[K+].[I-]", np.nan, np.nan, "GDSC1"]

is_bdilv = new_df["drug_name"] == "BDILV000379a"
new_df[is_bdilv] = ["BDILV000379a", "BDILV000379a", "4-(4-(1-methyl-1H-pyrazol-4-yl)-1-(2-(1-methyl-1H-pyrazol-4-yl)ethyl)-4,5-dihydro-1H-imidazol-5-yl)benzonitrile", "CN1N=CC(=C1)C1N=CN(C1C1=CC=C(C#N)C=C1)CCC=1C=NN(C1)C", np.nan, np.nan, "GDSC2"]

In [26]:
# The dataset column is not part of the exported drug table.
new_df = new_df.drop("dataset", axis=1)
# Three more renames for the harmonized names.
mapping_dict["JNK-9L"] = "JNK inhibitor 9l"
mapping_dict["KIN001-260"] = "Potassium;azide;iodide"
mapping_dict["VNLG/124"] = "VNLG-124"

In [27]:
# slashes are a problem
def _slash_to_dash(text):
    """Return text with every "/" replaced by "-"."""
    return text.replace("/", "-")

# Both identifier columns are cast to str first and then cleaned.
for id_column in ("drug_name", "pubchem_id"):
    new_df[id_column] = new_df[id_column].astype(str)
for id_column in ("drug_name", "pubchem_id"):
    new_df[id_column] = new_df[id_column].apply(_slash_to_dash)

In [29]:
# Snapshot of the drug table before the names are harmonized.
new_df.to_csv("all_smiles_old_names.csv", index=False)

In [30]:
# new names: replace each drug name by its harmonized name, if it has one
# NOTE(review): "/" has already been replaced by "-" in drug_name at this
# point, so mapping keys that contain "/" (e.g. "VNLG/124") cannot match here.
new_df["drug_name"] = new_df["drug_name"].apply(lambda x: mapping_dict.get(x, x))

In [31]:
# Harmonizing can turn several rows into identical ones; keep one of each and
# export the final drug table.
new_df = new_df.drop_duplicates()
new_df.to_csv("all_smiles.csv", index=False)

Now we change all response files: 
1. We make a dictionary name -> pubchem id
2. We change the names and pubchem ids in the response files

In [33]:
# Harmonized drug name -> pubchem_id. If a name occurs in several rows, the
# last row wins.
name_to_pubchem = dict(zip(new_df["drug_name"], new_df["pubchem_id"]))

In [34]:
import os
# Rewrite every response file with harmonized drug names and PubChem IDs:
# writes <dataset>_new.csv next to the original file plus a drug_names.csv
# with the distinct (pubchem_id, drug_name) pairs of that dataset.
for dataset, file in all_datasets.items():
    print("Processing", dataset)
    df = pd.read_csv(file)
    df["drug_name"] = df["drug_name"].apply(lambda x: mapping_dict.get(x, x))
    # The keys of name_to_pubchem already have "/" replaced by "-", so the
    # names must be normalized BEFORE the lookup; otherwise a name containing
    # "/" is never found and silently ends up as its own ID.
    df["drug_name"] = df["drug_name"].astype(str)
    df["drug_name"] = df["drug_name"].apply(lambda x: x.replace("/", "-"))
    # Names without a known PubChem ID fall back to the name itself.
    df["pubchem_id"] = df["drug_name"].apply(lambda x: name_to_pubchem.get(x, x))
    df["pubchem_id"] = df["pubchem_id"].astype(str)
    df["pubchem_id"] = df["pubchem_id"].apply(lambda x: x.replace("/", "-"))
    # Row identifier: "<cellosaurus_id>|<pubchem_id>".
    df["Name"] = df["cellosaurus_id"] + "|" + df["pubchem_id"]
    new_file = file.replace(".csv", "_new.csv")
    df.to_csv(new_file, index=False)
    df_drugs = df[['pubchem_id', 'drug_name']]
    df_drugs = df_drugs.drop_duplicates()
    # Named out_dir rather than dir so the builtin dir() is not shadowed.
    out_dir = os.path.dirname(new_file)
    df_drugs.to_csv(f"{out_dir}/drug_names.csv", index=False)

Processing CCLE
Processing CTRPv1
Processing CTRPv2


  df = pd.read_csv(file)


Processing GDSC1
Processing GDSC2


  df = pd.read_csv(file)


In [35]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np
import pandas as pd
# Load SMILES data; the second CSV column is used as the index
# (presumably pubchem_id, given the column order all_smiles.csv was written with)
drugs = pd.read_csv("all_smiles.csv", index_col=1)

def smiles_to_demorgan_fingerprints(smiles_df, n_bits=128, radius=2):
    """
    Compute a Morgan fingerprint for every row of a SMILES table.

    Parameters:
        smiles_df (pd.DataFrame): Table with a "canonical_smiles" column. The
            index value of each row is copied into the "pubchem_id" column of
            the result.
        n_bits (int): Number of bits for the fingerprint (default: 128).
        radius (int): Morgan fingerprint radius (default: 2).
    Returns:
        pd.DataFrame: One row per input row with the columns "pubchem_id" and
            "fingerprint". "fingerprint" holds a list of n_bits bits, or a
            list of n_bits NaN when the SMILES is missing or cannot be parsed.
    """
    # The generator only depends on radius and n_bits, so it is created once
    # (on the first usable SMILES) instead of once per row.
    generator = None
    pubchem_ids = []
    fingerprints = []
    for pubchem_id, smiles in smiles_df["canonical_smiles"].items():
        mol = None if pd.isna(smiles) else Chem.MolFromSmiles(smiles)
        if mol is None:
            # Missing SMILES, or one RDKit could not parse (MolFromSmiles
            # returns None in that case): emit an all-NaN placeholder instead
            # of failing inside GetFingerprint.
            fp = [np.nan] * n_bits
        else:
            if generator is None:
                generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
            fp = generator.GetFingerprint(mol).ToList()
        pubchem_ids.append(pubchem_id)
        fingerprints.append(fp)
    # Build the table once; concatenating a one-row frame per drug re-copies
    # the whole table on every iteration.
    return pd.DataFrame(
        {"pubchem_id": pubchem_ids, "fingerprint": fingerprints},
        columns=["pubchem_id", "fingerprint"],
    )

In [36]:
def _as_int_unless_missing(value):
    """Return value unchanged if it is NaN, otherwise cast it to int."""
    return value if pd.isna(value) else int(value)

# One CSV per fingerprint size: a column per pubchem_id, a row per bit.
for n_bits in [64, 128, 256, 512, 1024, 2048]:
    per_drug = smiles_to_demorgan_fingerprints(drugs, n_bits=n_bits).set_index("pubchem_id")
    # split list into columns
    bit_table = pd.DataFrame(per_drug["fingerprint"].tolist(), index=per_drug.index)
    # if an ID occurs more than once, only its first fingerprint is kept
    is_first_occurrence = ~bit_table.index.duplicated(keep='first')
    transposed = bit_table[is_first_occurrence].T
    # set everything to integers if it is not nan
    fingerprints = transposed.map(_as_int_unless_missing)
    fingerprints.to_csv(f"pubchem_id_to_demorgan_{n_bits}_map.csv", index=False)

In [37]:
# pubchem_id (as str) -> list of datasets whose drug_names.csv lists that ID.
drug_to_dataset = {}
for dataset, file in all_datasets.items():
    names_file = f"{os.path.dirname(file)}/drug_names.csv"
    # The first column of drug_names.csv is read as the index.
    for pubchem_id in pd.read_csv(names_file, index_col=0).index:
        drug_to_dataset.setdefault(str(pubchem_id), []).append(dataset)

In [38]:
# subset fingerprints: for every dataset, write one fingerprint file per size
# that contains only the drugs (columns) occurring in that dataset
for dataset, file in all_datasets.items():
    parent = os.path.dirname(file)
    new_dir = f"{parent}/drug_fingerprints"
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(new_dir, exist_ok=True)
    for n_bits in [64, 128, 256, 512, 1024, 2048]:
        fingerprints = pd.read_csv(f"pubchem_id_to_demorgan_{n_bits}_map.csv")
        # The column headers of the map file are the pubchem_ids (as strings).
        subset = [drug for drug in fingerprints.columns if dataset in drug_to_dataset.get(drug, ())]
        fingerprints = fingerprints[subset]
        fingerprints.to_csv(f"{new_dir}/pubchem_id_to_demorgan_{n_bits}_map.csv", index=False)

In [41]:
# Export the name mapping (old_name -> new_name, plus the PubChem ID of the new
# name) for all entries that actually change the name.
name_changes = pd.DataFrame.from_dict(mapping_dict, orient="index", columns=["new_name"])
name_changes.index.name = "old_name"
is_renamed = name_changes["new_name"] != name_changes.index
# New names without a known PubChem ID fall back to the name itself.
mapping_df = name_changes[is_renamed].assign(
    pubchem_id=lambda frame: frame["new_name"].apply(lambda x: name_to_pubchem.get(x, x))
)
mapping_df.to_csv("mapping_dict.csv")

In [8]:
# ":" is also a problem in drug names and IDs. This affects CTRPv2, GDSC1 and
# TOYv1. For each of them, ":" is replaced by "-" in the response file, in
# drug_names.csv, in the fingerprint column headers and in the DIPK drug
# feature file names.
def _colon_to_dash(values):
    """Replace ":" by "-" in a string Series; missing values stay missing."""
    # .str.replace skips NaN, whereas calling x.replace on every element
    # raises AttributeError on the first missing value.
    return values.str.replace(":", "-", regex=False)

for dataset in ["CTRPv2", "GDSC1", "TOYv1"]:
    print("Processing", dataset)
    # NOTE(review): this reads <dataset>.csv and writes <dataset>_new.csv, the
    # same file name the name-harmonization step writes - confirm which file is
    # meant to be the input here.
    response_file = f"/Users/judithbernett/PycharmProjects/drp_model_suite/data/{dataset}/{dataset}.csv"
    df = pd.read_csv(response_file, dtype={"pubchem_id": str, "drug_name": str})
    df["drug_name"] = _colon_to_dash(df["drug_name"])
    df["pubchem_id"] = _colon_to_dash(df["pubchem_id"])
    # NOTE(review): a "Name" column built from pubchem_id, if present, is not
    # updated here - confirm whether it needs the same replacement.
    new_file = response_file.replace(".csv", "_new.csv")
    df.to_csv(new_file, index=False)

    drug_name_file = f"/Users/judithbernett/PycharmProjects/drp_model_suite/data/{dataset}/drug_names.csv"
    drug_names = pd.read_csv(drug_name_file, dtype={"pubchem_id": str, "drug_name": str})
    drug_names["drug_name"] = _colon_to_dash(drug_names["drug_name"])
    drug_names["pubchem_id"] = _colon_to_dash(drug_names["pubchem_id"])
    new_file = drug_name_file.replace(".csv", "_new.csv")
    drug_names.to_csv(new_file, index=False)

    # The fingerprint files carry the IDs as column headers.
    fingerprint_dir = f"/Users/judithbernett/PycharmProjects/drp_model_suite/data/{dataset}/drug_fingerprints"
    for n_bits in [64, 128, 256, 512, 1024, 2048]:
        fingerprint_file = f"{fingerprint_dir}/pubchem_id_to_demorgan_{n_bits}_map.csv"
        fingerprints = pd.read_csv(fingerprint_file, dtype=str)
        fingerprints.columns = [col.replace(":", "-") for col in fingerprints.columns]
        new_file = fingerprint_file.replace(".csv", "_new.csv")
        fingerprints.to_csv(new_file, index=False)

    dipk_molgnet_dir = f"/Users/judithbernett/PycharmProjects/drp_model_suite/data/{dataset}/DIPK_features/Drugs"
    all_files = os.listdir(dipk_molgnet_dir)
    # if file name contains :, we need to change it (renamed in place)
    for file in all_files:
        if ":" in file:
            new_file = file.replace(":", "-")
            os.rename(f"{dipk_molgnet_dir}/{file}", f"{dipk_molgnet_dir}/{new_file}")

Processing TOYv1
