# Query ChEMBL info for the prodrugs in the supplementary material

In [1]:
import pandas as pd
import sqlite3

In [2]:
# Notebook configuration: ChEMBL release, project directories and the database connection.
chembl_version = "26"
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
base_path = "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/"
path_results = base_path+"results/"
path_sqlite3_database = f"{base_path}data/chembl_{chembl_version}/chembl_{chembl_version}_sqlite/chembl_{chembl_version}.db"

paper_path = base_path + 'data/jm1c00416_si_002.xlsx'
# Open the ChEMBL dump read-only via an SQLite URI. A plain sqlite3.connect(path)
# silently creates a new, empty database when the file name is wrong, which only
# surfaces later as a "no such table" error; mode=ro fails right here instead and
# also guards the reference data against accidental writes.
# (A path containing '?', '#' or '%' would need percent-encoding in the URI.)
engine_ch = sqlite3.connect(f"file:{path_sqlite3_database}?mode=ro", uri=True)

## List of prodrugs and active metabolites from the supplementary material

In [3]:
# Load the manual prodrug curation sheet from the supplementary workbook.
# The path is handed to pandas directly so that pandas opens and closes the file
# itself; wrapping it in a bare open(...) left the file handle unclosed.
df_paper = pd.read_excel(paper_path, sheet_name='Manual curation_prodrugs')
df_paper

Unnamed: 0,CMPD_PREF_NAME (Active metabolite),"CMPD_PREF_NAME (Prodrug removed, potency data for active metabolite-drug target interaction used in analysis)",Unnamed: 2,Unnamed: 3,CMPD_PREF_NAME (Active metabolite).1,"CMPD_PREF_NAME (Prodrug removed, not used in analysis since no potency data for active metabolite-drug target interaction)"
0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",Didanosine,,,CARBOVIR TRIPHOSPHATE,Abacavir
1,7-ETHYL-10-HYDROXYCAMPTOTHECIN,Irinotecan,,,5-AZA-2′-DEOXYCYTIDINE TRIPHOSPHATE,Azacitidine
2,ABIRATERONE,Abiraterone acetate,,,DECITABINE TRIPHOSPHATE,Decitabine
3,AFIMOXIFENE,Tamoxifen,,,DIFLORASONE,Diflorasone diacetate
4,CANDESARTAN,Candesartan cilexetil,,,MOEXIPRILAT,Moexipril
5,CLOFIBRIC ACID,Clofibrate,,,PERINDOPRILAT,Perindopril
6,DABIGATRAN,Dabigatran etexilate,,,RAMIPRILAT,Ramipril
7,DESGLYMIDODRINE,Midodrine,,,TAZAROTENIC ACID,Tazarotene
8,DESLORATADINE,Loratidine,,,TRANDOLAPRILAT,Trandolapril
9,ENALAPRILAT,Enalapril,,,BENAZEPRILAT,Benazepril


In [4]:
# First column pair of the sheet: prodrugs whose header says the active
# metabolite's potency data was used in the analysis. Rows are tagged column == 1.
# The mapping keys double as the column selection; the header text (including the
# double space after "removed,") has to match the sheet exactly.
# NOTE(review): no dropna() here, unlike the second column pair - presumably this
# pair fills every row of the sheet; confirm.
used_pair_names = {
    'CMPD_PREF_NAME (Active metabolite)': 'active_metabolite_pref_name',
    'CMPD_PREF_NAME (Prodrug removed,  potency data for active metabolite-drug target interaction used in analysis)': 'prodrug_pref_name',
}
prodrugs_1 = (
    df_paper[list(used_pair_names)]
    .rename(columns=used_pair_names)
    .assign(column=1)
)
prodrugs_1

Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column
0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",Didanosine,1
1,7-ETHYL-10-HYDROXYCAMPTOTHECIN,Irinotecan,1
2,ABIRATERONE,Abiraterone acetate,1
3,AFIMOXIFENE,Tamoxifen,1
4,CANDESARTAN,Candesartan cilexetil,1
5,CLOFIBRIC ACID,Clofibrate,1
6,DABIGATRAN,Dabigatran etexilate,1
7,DESGLYMIDODRINE,Midodrine,1
8,DESLORATADINE,Loratidine,1
9,ENALAPRILAT,Enalapril,1


In [5]:
# Second column pair of the sheet: prodrugs whose header says they were not used
# in the analysis for lack of potency data. Rows are tagged column == 2.
# Rows with a missing value in either of the two columns are dropped before renaming.
unused_pair_names = {
    'CMPD_PREF_NAME (Active metabolite).1': 'active_metabolite_pref_name',
    'CMPD_PREF_NAME (Prodrug removed, not used in analysis since no potency data for active metabolite-drug target interaction)': 'prodrug_pref_name',
}
prodrugs_2 = (
    df_paper[list(unused_pair_names)]
    .dropna()
    .rename(columns=unused_pair_names)
    .assign(column=2)
)
prodrugs_2

Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column
0,CARBOVIR TRIPHOSPHATE,Abacavir,2
1,5-AZA-2′-DEOXYCYTIDINE TRIPHOSPHATE,Azacitidine,2
2,DECITABINE TRIPHOSPHATE,Decitabine,2
3,DIFLORASONE,Diflorasone diacetate,2
4,MOEXIPRILAT,Moexipril,2
5,PERINDOPRILAT,Perindopril,2
6,RAMIPRILAT,Ramipril,2
7,TAZAROTENIC ACID,Tazarotene,2
8,TRANDOLAPRILAT,Trandolapril,2
9,BENAZEPRILAT,Benazepril,2


In [6]:
# Stack both column pairs into one table and upper-case the prodrug names;
# they are later matched against ChEMBL preferred names by exact string equality.
prodrugs = pd.concat([prodrugs_1, prodrugs_2]).assign(
    prodrug_pref_name=lambda frame: frame['prodrug_pref_name'].str.upper()
)
print("#produgs:", prodrugs.shape[0])
prodrugs

#produgs: 56


Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column
0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",DIDANOSINE,1
1,7-ETHYL-10-HYDROXYCAMPTOTHECIN,IRINOTECAN,1
2,ABIRATERONE,ABIRATERONE ACETATE,1
3,AFIMOXIFENE,TAMOXIFEN,1
4,CANDESARTAN,CANDESARTAN CILEXETIL,1
5,CLOFIBRIC ACID,CLOFIBRATE,1
6,DABIGATRAN,DABIGATRAN ETEXILATE,1
7,DESGLYMIDODRINE,MIDODRINE,1
8,DESLORATADINE,LORATIDINE,1
9,ENALAPRILAT,ENALAPRIL,1


## Compound information from ChEMBL26

In [7]:
# Fetch every molecule that has a preferred name, together with the molecule that
# molecule_hierarchy records as its parent. molecule_dictionary is joined twice:
# `md` supplies the salt_* columns (the row matched by name) and `md2` supplies
# the parent_* columns (looked up through mh.parent_molregno).
# DISTINCT removes repeated (salt, parent) combinations.
sql = """
SELECT DISTINCT md.molregno as salt_molregno, md.pref_name as salt_pref_name, md.chembl_id as salt_chembl_id,
    md2.molregno as parent_molregno, md2.pref_name as parent_pref_name, md2.chembl_id as parent_chembl_id
FROM molecule_dictionary md
JOIN molecule_hierarchy mh 
    ON md.molregno = mh.molregno
JOIN molecule_dictionary md2 
    ON mh.parent_molregno = md2.molregno                           --parent_molregno
WHERE md.pref_name is not null
"""

# Run the query on the ChEMBL SQLite connection and load the result into a DataFrame.
df_mols = pd.read_sql_query(sql, con=engine_ch)
df_mols

Unnamed: 0,salt_molregno,salt_pref_name,salt_chembl_id,parent_molregno,parent_pref_name,parent_chembl_id
0,250,SB-203580,CHEMBL10,250,SB-203580,CHEMBL10
1,12356,LEVCROMAKALIM,CHEMBL100,12356,LEVCROMAKALIM,CHEMBL100
2,111185,CETIRIZINE,CHEMBL1000,111185,CETIRIZINE,CHEMBL1000
3,164444,AN-9,CHEMBL100014,164444,AN-9,CHEMBL100014
4,165474,PENTAZOCINE,CHEMBL100116,165474,PENTAZOCINE,CHEMBL100116
...,...,...,...,...,...,...
45523,164806,IMIDAPRILAT,CHEMBL99701,164806,IMIDAPRILAT,CHEMBL99701
45524,161939,PAULLONE,CHEMBL99779,161939,PAULLONE,CHEMBL99779
45525,110803,LORATADINE,CHEMBL998,110803,LORATADINE,CHEMBL998
45526,160881,8-AMINOQUINOLINE,CHEMBL99932,160881,8-AMINOQUINOLINE,CHEMBL99932


## Combine prodrug information with the ChEMBL information

In [8]:
# Look up each prodrug in the ChEMBL table by exact preferred-name match.
# A left join keeps prodrugs without a match; their ChEMBL columns stay empty.
df_combined = pd.merge(
    prodrugs,
    df_mols,
    how="left",
    left_on='prodrug_pref_name',
    right_on='salt_pref_name',
)
# Sanity check: did every matched name resolve to a record that is its own parent?
prodrug_ids_match = df_combined['salt_molregno'].equals(df_combined['parent_molregno'])
print("Salts and parents are identical:", prodrug_ids_match)
# Keep only the parent-level identifiers and label them as belonging to the prodrug.
df_combined = (
    df_combined
    .drop(columns=['salt_molregno', 'salt_pref_name', 'salt_chembl_id'])
    .rename(columns={
        'parent_molregno': 'prodrug_parent_molregno',
        'parent_pref_name': 'prodrug_parent_pref_name',
        'parent_chembl_id': 'prodrug_parent_chembl_id',
    })
)
df_combined

Salts and parents are identical: True


Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column,prodrug_parent_molregno,prodrug_parent_pref_name,prodrug_parent_chembl_id
0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",DIDANOSINE,1,390877.0,DIDANOSINE,CHEMBL1460
1,7-ETHYL-10-HYDROXYCAMPTOTHECIN,IRINOTECAN,1,5985.0,IRINOTECAN,CHEMBL481
2,ABIRATERONE,ABIRATERONE ACETATE,1,438435.0,ABIRATERONE ACETATE,CHEMBL271227
3,AFIMOXIFENE,TAMOXIFEN,1,6968.0,TAMOXIFEN,CHEMBL83
4,CANDESARTAN,CANDESARTAN CILEXETIL,1,116349.0,CANDESARTAN CILEXETIL,CHEMBL1014
5,CLOFIBRIC ACID,CLOFIBRATE,1,16415.0,CLOFIBRATE,CHEMBL565
6,DABIGATRAN,DABIGATRAN ETEXILATE,1,561520.0,DABIGATRAN ETEXILATE,CHEMBL539697
7,DESGLYMIDODRINE,MIDODRINE,1,675163.0,MIDODRINE,CHEMBL1201212
8,DESLORATADINE,LORATIDINE,1,,,
9,ENALAPRILAT,ENALAPRIL,1,16847.0,ENALAPRIL,CHEMBL578


In [9]:
# Attach the ChEMBL records of the active metabolites, again by exact
# preferred-name match with a left join.
df_combined = pd.merge(
    df_combined,
    df_mols,
    how="left",
    left_on='active_metabolite_pref_name',
    right_on='salt_pref_name',
)

metabolite_salt_ids = df_combined['salt_molregno']
metabolite_parent_ids = df_combined['parent_molregno']
print("Salts and parents are identical:", metabolite_salt_ids.equals(metabolite_parent_ids))
print("Salt and parent info are not identical: ")
# Show the matched rows whose record differs from its parent record.
df_combined[metabolite_salt_ids.notnull() & metabolite_salt_ids.ne(metabolite_parent_ids)]

Salts and parents are identical: False
Salt and parent info are not identical: 


Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column,prodrug_parent_molregno,prodrug_parent_pref_name,prodrug_parent_chembl_id,salt_molregno,salt_pref_name,salt_chembl_id,parent_molregno,parent_pref_name,parent_chembl_id
11,ENALAPRILAT,ENALAPRIL,1,16847.0,ENALAPRIL,CHEMBL578,2197277.0,ENALAPRILAT,CHEMBL3989406,16759.0,ENALAPRILAT,CHEMBL577


In [10]:
# Keep only the parent-level identifiers of the active metabolite, label them
# accordingly, and collapse rows that became identical once the salt columns
# were removed.
df_combined = (
    df_combined
    .drop(columns=['salt_molregno', 'salt_pref_name', 'salt_chembl_id'])
    .rename(columns={
        'parent_molregno': 'active_metabolite_parent_molregno',
        'parent_pref_name': 'active_metabolite_parent_pref_name',
        'parent_chembl_id': 'active_metabolite_parent_chembl_id',
    })
    .drop_duplicates()
)
# The row count can exceed the number of prodrugs when one name matches several ChEMBL records.
print("Info about prodrugs:", len(df_combined), "(for", len(prodrugs), "prodrugs)")
df_combined

Info about prodrugs: 58 (for 56 prodrugs)


Unnamed: 0,active_metabolite_pref_name,prodrug_pref_name,column,prodrug_parent_molregno,prodrug_parent_pref_name,prodrug_parent_chembl_id,active_metabolite_parent_molregno,active_metabolite_parent_pref_name,active_metabolite_parent_chembl_id
0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",DIDANOSINE,1,390877.0,DIDANOSINE,CHEMBL1460,323638.0,"2',3'-DIDEOXYADENOSINE TRIPHOSPHATE",CHEMBL1383
1,7-ETHYL-10-HYDROXYCAMPTOTHECIN,IRINOTECAN,1,5985.0,IRINOTECAN,CHEMBL481,61315.0,7-ETHYL-10-HYDROXYCAMPTOTHECIN,CHEMBL837
2,ABIRATERONE,ABIRATERONE ACETATE,1,438435.0,ABIRATERONE ACETATE,CHEMBL271227,422175.0,ABIRATERONE,CHEMBL254328
3,AFIMOXIFENE,TAMOXIFEN,1,6968.0,TAMOXIFEN,CHEMBL83,6341.0,AFIMOXIFENE,CHEMBL10041
4,AFIMOXIFENE,TAMOXIFEN,1,6968.0,TAMOXIFEN,CHEMBL83,1408997.0,AFIMOXIFENE,CHEMBL2137046
5,AFIMOXIFENE,TAMOXIFEN,1,6968.0,TAMOXIFEN,CHEMBL83,6402.0,AFIMOXIFENE,CHEMBL489
6,CANDESARTAN,CANDESARTAN CILEXETIL,1,116349.0,CANDESARTAN CILEXETIL,CHEMBL1014,116848.0,CANDESARTAN,CHEMBL1016
7,CLOFIBRIC ACID,CLOFIBRATE,1,16415.0,CLOFIBRATE,CHEMBL565,29390.0,CLOFIBRIC ACID,CHEMBL683
8,DABIGATRAN,DABIGATRAN ETEXILATE,1,561520.0,DABIGATRAN ETEXILATE,CHEMBL539697,75302.0,DABIGATRAN,CHEMBL48361
9,DESGLYMIDODRINE,MIDODRINE,1,675163.0,MIDODRINE,CHEMBL1201212,140006.0,DESGLYMIDODRINE,CHEMBL1076


## Save information to csv

In [11]:
# Write the combined prodrug / active-metabolite table to the results folder,
# leaving out the DataFrame index.
prodrugs_csv_path = path_results+'prodrugs.csv'
df_combined.to_csv(prodrugs_csv_path, index=False)