# Steps
- Extract drug names for prostate cancers - (TCGA_DESC = PRAD)
- Filter extracted drugs based on IC50 value
    - LN_IC50 <= 2.302 (ln(10) ≈ 2.3026, i.e. IC50 <= 10 uM) - Why?
- Extract unique list of drugs, if not already done
- Find canonical SMILES of each drug from PubChem
- Standardize canonical SMILES using rdkit
- Export the processed file to the reference_drug.ipynb notebook, then follow the fingerprint construction pipeline.

In [1]:
import pandas as pd

In [2]:
# Load the GDSC2 fitted dose-response table and report its dimensions.
GDSC2_CSV_PATH = 'GDSC2_fitted_dose_response_27Oct23.csv'
df = pd.read_csv(GDSC2_CSV_PATH)
print(df.shape)
# Preview the first rows; left as the final expression so the notebook renders it.
df.head()

(242036, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.463887,0.93022,0.089052,0.433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-4.869455,0.61497,0.111351,-1.4211
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.360586,0.791072,0.142855,-0.599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-5.04494,0.59266,0.135539,-1.516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.741991,0.734047,0.128059,-0.807232


In [3]:
# Keep only the rows labelled PRAD in TCGA_DESC (the prostate subset from the steps list).
is_prad = df['TCGA_DESC'].eq('PRAD')
df_prad = df[is_prad]
print(df_prad.shape)
# Preview the first rows; left as the final expression so the notebook renders it.
df_prad.head()

(1676, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
167,GDSC2,343,15987973,905934,PC-3,SIDM00088,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.796951,0.861456,0.085586,0.25178
168,GDSC2,343,15988254,905935,DU-145,SIDM00120,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.918755,0.80363,0.103836,-0.359006
361,GDSC2,343,16039919,907788,LNCaP-Clone-FGC,SIDM00683,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.58924,0.745884,0.090794,-0.724064
608,GDSC2,343,16102293,924100,22RV1,SIDM00499,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.903667,0.783025,0.106837,-0.350791
863,GDSC2,343,16164935,1299075,VCaP,SIDM01077,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.911319,0.866654,0.061343,0.189511


In [4]:
# Retain PRAD rows whose LN_IC50 is at or below 2.302 (the 10 uM cut-off from the steps list).
# NOTE(review): 2.302 is a truncated ln(10) (~2.3026); rows between the two values are excluded.
within_cutoff = df_prad['LN_IC50'].le(2.302)
df_prad_filtered = df_prad[within_cutoff]
print(df_prad_filtered.shape)
# Preview the first rows; left as the final expression so the notebook renders it.
df_prad_filtered.head()

(495, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
167,GDSC2,343,15987973,905934,PC-3,SIDM00088,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.796951,0.861456,0.085586,0.25178
168,GDSC2,343,15988254,905935,DU-145,SIDM00120,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.918755,0.80363,0.103836,-0.359006
361,GDSC2,343,16039919,907788,LNCaP-Clone-FGC,SIDM00683,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.58924,0.745884,0.090794,-0.724064
608,GDSC2,343,16102293,924100,22RV1,SIDM00499,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.903667,0.783025,0.106837,-0.350791
863,GDSC2,343,16164935,1299075,VCaP,SIDM01077,PRAD,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.911319,0.866654,0.061343,0.189511


In [5]:
# Collect the distinct drug names once, then print how many there are and the names themselves.
unique_drug_names = set(df_prad_filtered['DRUG_NAME'])
print(len(unique_drug_names))
print(unique_drug_names)

152
{'CDK9_5576', 'BI-2536', 'N25720-51-A1', 'Lestaurtinib', 'GNE-317', 'Camptothecin', 'Dihydrorotenone', 'BDP-00009066', 'Talazoparib', 'ABT737', 'Afatinib', 'Bleomycin (50 uM)', 'Ipatasertib', 'Obatoclax Mesylate', 'Paclitaxel', 'VE-822', 'Staurosporine', 'Sepantronium bromide', 'AZD5582', 'Foretinib', 'AZD1332', 'BMS-536924', 'SGC0946', 'Dinaciclib', 'Vorinostat', 'Luminespib', 'Daporinad', 'Afuresertib', 'AZ6102', 'Sorafenib', 'AT13148', 'KRAS (G12C) Inhibitor-12', 'JQ1', 'Vinorelbine', '741909', 'AZD5438', 'Pevonedistat', 'Crizotinib', 'Oxaliplatin', 'AZD5363', 'AZD2014', 'Dasatinib', 'Irinotecan', 'Telomerase Inhibitor IX', 'AMG-319', 'Dactolisib', 'Pyridostatin', 'Cediranib', 'CPI-637', 'Entinostat', 'VSP34_8731', 'GSK343', 'Gallibiscoquinazole', 'NVP-ADW742', 'P22077', 'AZD7762', 'Ulixertinib', 'MK-1775', 'Alisertib', 'UMI-77', 'Nutlin-3a (-)', 'MK-2206', 'LMP744', 'AZD5991', 'AGK2', 'CT7033-2', 'ZM447439', 'Acetalax', '150412', 'POMHEX', 'Tanespimycin', 'Osimertinib', 'Bortez