# Steps
- Extract drug names for prostate cancers - (TCGA_DESC = PRAD)
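- Filter extracted drugs by maximum screening concentration (the `MAX_CONC` column, not the IC50 value itself)
    - MAX_CONC <= 10 uM - Why? (the rationale for this cutoff still needs to be documented)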
- Extract unique list of drugs, if not already done
- Find canonical SMILES of each drug from PubChem
- Standardize canonical SMILES using rdkit
- Export the processed file for use in the reference_drug.ipynb notebook, then follow that notebook's fingerprint construction pipeline.

In [1]:
import pandas as pd

In [2]:
# Load the GDSC2 fitted dose-response table from the working directory,
# report its (rows, columns) shape, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='GDSC2_fitted_dose_response_27Oct23.csv')
print(df.shape)
df.head(n=5)

(242036, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.463887,0.93022,0.089052,0.433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-4.869455,0.61497,0.111351,-1.4211
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.360586,0.791072,0.142855,-0.599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-5.04494,0.59266,0.135539,-1.516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.741991,0.734047,0.128059,-0.807232


In [3]:
# Keep only the rows whose TCGA_DESC label is exactly 'PRAD',
# then report the subset's shape and preview it.
df_prad = df.loc[df['TCGA_DESC'].eq('PRAD')]
print(df_prad.shape)
df_prad.head(n=5)

(1676, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
167,GDSC2,343,15987973,905934,PC-3,SIDM00088,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.796951,0.861456,0.085586,0.25178
168,GDSC2,343,15988254,905935,DU-145,SIDM00120,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.918755,0.80363,0.103836,-0.359006
361,GDSC2,343,16039919,907788,LNCaP-Clone-FGC,SIDM00683,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.58924,0.745884,0.090794,-0.724064
608,GDSC2,343,16102293,924100,22RV1,SIDM00499,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.903667,0.783025,0.106837,-0.350791
863,GDSC2,343,16164935,1299075,VCaP,SIDM01077,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.911319,0.866654,0.061343,0.189511


In [4]:
# Drop PRAD rows whose MAX_CONC exceeds 10 (rows with MAX_CONC <= 10 are kept),
# then report the remaining shape and preview it.
df_prad_filtered = df_prad.loc[df_prad['MAX_CONC'].le(10)]
print(df_prad_filtered.shape)
df_prad_filtered.head(n=5)

(1517, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
167,GDSC2,343,15987973,905934,PC-3,SIDM00088,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.796951,0.861456,0.085586,0.25178
168,GDSC2,343,15988254,905935,DU-145,SIDM00120,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.918755,0.80363,0.103836,-0.359006
361,GDSC2,343,16039919,907788,LNCaP-Clone-FGC,SIDM00683,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.58924,0.745884,0.090794,-0.724064
608,GDSC2,343,16102293,924100,22RV1,SIDM00499,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.903667,0.783025,0.106837,-0.350791
863,GDSC2,343,16164935,1299075,VCaP,SIDM01077,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.911319,0.866654,0.061343,0.189511


In [5]:
# Print the number of distinct DRUG_NAME values on one line and the
# set of those names on the next.
print(
    len(set(df_prad_filtered['DRUG_NAME'])),
    set(df_prad_filtered['DRUG_NAME']),
    sep='\n',
)

247
{'JNK Inhibitor VIII', 'ZM447439', 'Vorinostat', '729189', '743380', '630600', 'GSK2606414', 'Tamoxifen', 'TW 37', 'Dabrafenib', 'Podophyllotoxin bromide', 'THR-103', 'Mitoxantrone', 'SB216763', 'Trametinib', 'AZD5363', 'AGI-5198', 'GSK2830371A', 'GSK2578215A', 'CPI-637', '50869', 'BIBR-1532', 'Nilotinib', 'Sinularin', 'BMS-536924', 'AZD5438', 'Entospletinib', 'AZ6102', 'GSK-LSD1', 'GSK3337463A', 'MN-64', 'BDP-00009066', 'Camptothecin', 'Lestaurtinib', 'IAP_5620', 'BMS-345541', 'Avagacestat', 'VTP-B', 'GSK343', 'N29087-69-1', 'LMB_AB3', 'N30652-18-1', 'KU-55933', 'TAF1_5496', 'Savolitinib', 'Obatoclax Mesylate', 'CZC24832', '123138', 'MG-132', 'CCT007093', 'ABT737', 'Erlotinib', 'Staurosporine', 'CDK9_5038', 'AZD1332', 'LMB_AB2', 'Rapamycin', 'Ruxolitinib', 'Sabutoclax', 'N27922-53-1', 'PD0325901', 'UMI-77', 'AMG-319', 'UNC0638', '765771', 'Mycophenolic acid', 'Vismodegib', 'Epirubicin', 'JQ1', 'LY2109761', 'BPD-00008900', 'VX-11e', 'GSK1904529A', 'Doramapimod', 'ERK_6604', 'HKMTI-

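### There are two types of identifiers present for the drugs: names and numeric IDs (treated below as PubChem CIDs)
- Separate the two types and extract the SMILES separately for each group
- NOTE: confirm that the numeric identifiers really are PubChem CIDs (and not another numbering scheme) before trusting the CID-based lookup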

In [31]:
# Example PUG REST request (several CIDs, several properties, CSV output):
# https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/1,2,3,4,5/property/MolecularFormula,MolecularWeight,CanonicalSMILES/CSV
import requests
import xml.etree.ElementTree as ET

# Single compound used to try out the name-based lookup
compound_name = "GSK2606414"

# Name-based property request, answered as XML
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/CanonicalSMILES/XML"

# Bound the wait so an unresponsive server cannot hang the notebook
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Parse the XML payload returned by PubChem
    data = response.text
    root = ET.fromstring(data)

    # find() returns None when the element is absent; guard before reading
    # .text so a response without the property prints None instead of raising
    smiles_node = root.find('.//{http://pubchem.ncbi.nlm.nih.gov/pug_rest}CanonicalSMILES')
    canonical_smiles = smiles_node.text if smiles_node is not None else None

    print(canonical_smiles)
else:
    print(f"Request failed with status code {response.status_code}")

CN1C=C(C2=C(N=CN=C21)N)C3=CC4=C(C=C3)N(CC4)C(=O)CC5=CC(=CC=C5)C(F)(F)F


In [32]:
def name2smiles(name, timeout=30):
    """Look up a compound's SMILES by name through PubChem PUG REST.

    Args:
        name: Compound name or synonym to resolve (e.g. ``'aspirin'``).
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The SMILES text of the first matching element in the response, or
        ``None`` when the request cannot be completed, PubChem answers with
        a non-200 status, or the response contains no SMILES element.
    """
    # Local import so the function does not depend on another cell for it.
    from urllib.parse import quote

    # Percent-encode the name: drug names may contain spaces or characters
    # such as '/', '?' or '#' that would otherwise alter the URL path.
    encoded_name = quote(str(name), safe='')
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/CanonicalSMILES/XML"

    # Treat transport failures (DNS, connection reset, timeout) the same way
    # as HTTP failures: report and return None so a batch loop can continue.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None

    root = ET.fromstring(response.text)
    namespace = '{http://pubchem.ncbi.nlm.nih.gov/pug_rest}'

    # find() returns None when a tag is absent, so check before reading .text.
    # NOTE(review): PubChem is reported to have renamed this property to
    # ConnectivitySMILES; the fallback tag is harmless if that is not the
    # case - confirm against the current PUG REST documentation.
    for tag in ('CanonicalSMILES', 'ConnectivitySMILES'):
        node = root.find(f'.//{namespace}{tag}')
        if node is not None and node.text:
            return node.text

    print(f"No SMILES found in PubChem response for {name!r}")
    return None

In [33]:
# Quick check of the name-based lookup; the notebook displays the returned value.
name2smiles('aspirin')

'CC(=O)OC1=CC=CC=C1C(=O)O'

In [34]:
def cid2smiles(cid, timeout=30):
    """Look up a compound's SMILES by PubChem CID through PUG REST.

    Args:
        cid: PubChem compound identifier, as an int or a string of digits.
        timeout: Seconds to wait for PubChem before giving up.

    Returns:
        The SMILES text of the first matching element in the response, or
        ``None`` when the request cannot be completed, PubChem answers with
        a non-200 status, or the response contains no SMILES element.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES/XML"

    # Treat transport failures (DNS, connection reset, timeout) the same way
    # as HTTP failures: report and return None so a batch loop can continue.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None

    root = ET.fromstring(response.text)
    namespace = '{http://pubchem.ncbi.nlm.nih.gov/pug_rest}'

    # find() returns None when a tag is absent, so check before reading .text.
    # NOTE(review): PubChem is reported to have renamed this property to
    # ConnectivitySMILES; the fallback tag is harmless if that is not the
    # case - confirm against the current PUG REST documentation.
    for tag in ('CanonicalSMILES', 'ConnectivitySMILES'):
        node = root.find(f'.//{namespace}{tag}')
        if node is not None and node.text:
            return node.text

    print(f"No SMILES found in PubChem response for CID {cid!r}")
    return None

In [35]:
# Quick check of the CID-based lookup using '729189', one of the numeric
# DRUG_NAME values printed earlier; the notebook displays the returned value.
cid2smiles('729189')

'CC1=NN(C2=C1C(=O)CC(C2)(C)C)C3=CC=C(C=C3)F'