# Notebook to Extract Drug-Target Interactions Based on ChEMBL Data

### Authors: Barbara Zdrazil, Lina Heinzke
### 12/2022

This notebook extracts data from ChEMBL in order to retrieve a data set for drug-target and clinical candidate-target associations as well as comparator compounds for the respective targets.

The notebook is based on initial work by Anne Hersey, Patrica Bento, Emma Manners, Paul Leeson, and Andrew Leach:  
*Target-Based Evaluation of “Drug-Like” Properties and Ligand Efficiencies  
Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, and Andrew R. Leach  
J. Med. Chem. 2021, 64, 11, 7210–7230  
[DOI: 10.1021/acs.jmedchem.1c00416](https://doi.org/10.1021/acs.jmedchem.1c00416)*


More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes


In [1]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Scaffolds import MurckoScaffold

In [2]:
# notebook settings
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

# Get Data From ChEMBL

In [3]:
chembl_version = '31'
base_path = '/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/'
path_results = base_path+'results/'
path_sqlite3_database = base_path+'data/chembl_'+chembl_version+'/chembl_'+chembl_version+'_sqlite/chembl_'+chembl_version+'.db'
chembl_con = sqlite3.connect(path_sqlite3_database)

Initial query for activities + related assay, mutation, target und docs information.

In [4]:
sql = '''
SELECT act.pchembl_value, 
    md.molregno as parent_molregno, md.chembl_id as parent_chemblid, md.pref_name as parent_pref_name,
    md.max_phase, md.first_approval, md.usan_year, md.black_box_warning, 
    md.prodrug, md.oral, md.parenteral, md.topical, 
    ass.assay_type, ass.tid, 
    vs.mutation,
    td.chembl_id as target_chembl_id, td.pref_name as target_pref_name, td.target_type, td.organism, 
    docs.year
FROM activities act
INNER JOIN molecule_hierarchy mh 
    ON act.molregno = mh.molregno         -- act.molregno = salt_molregno
INNER JOIN molecule_dictionary md
    ON mh.parent_molregno = md.molregno   -- compound information based on parent compound
INNER JOIN assays ass 
    ON  act.assay_id = ass.assay_id
LEFT JOIN variant_sequences vs
    ON ass.variant_id = vs.variant_id
INNER JOIN target_dictionary td
    ON ass.tid = td.tid
LEFT JOIN docs
    ON act.doc_id = docs.doc_id
WHERE act.pchembl_value is not null
    and act.potential_duplicate = 0
    and act.standard_relation = '='
    and data_validity_comment is null
    and td.tid <>22226                    -- exclude unchecked targets
    and td.target_type like '%PROTEIN%'
'''

df_mols = pd.read_sql_query(sql, con=chembl_con)
# target_id_mutation
df_mols['tid_mutation'] = np.where(df_mols['mutation'].notnull(), 
                                   df_mols['tid'].astype('str')+'_'+df_mols['mutation'], 
                                   df_mols['tid'].astype('str'))
# compound-target association
df_mols['cpd_target_pair'] = df_mols.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1)
df_mols

Unnamed: 0,pchembl_value,parent_molregno,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,assay_type,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,year,tid_mutation,cpd_target_pair
0,5.40,252199,CHEMBL357278,,0,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,252199_10483
1,4.77,253534,CHEMBL357119,,0,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,253534_10483
2,6.75,253199,CHEMBL152968,,0,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,253199_10483
3,5.22,253199,CHEMBL152968,,0,,,0,-1,0,0,0,A,12594,,CHEMBL3356,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,2004.0,12594,253199_12594
4,4.43,253199,CHEMBL152968,,0,,,0,-1,0,0,0,A,17045,,CHEMBL340,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,2004.0,17045,253199_17045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2672626,7.26,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,B,101602,,CHEMBL5907,Serine/threonine-protein kinase LATS2,SINGLE PROTEIN,Homo sapiens,2021.0,101602,2408605_101602
2672627,7.01,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,B,10811,,CHEMBL3231,Rho-associated protein kinase 1,SINGLE PROTEIN,Homo sapiens,2021.0,10811,2408605_10811
2672628,7.09,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,B,11149,,CHEMBL2973,Rho-associated protein kinase 2,SINGLE PROTEIN,Homo sapiens,2021.0,11149,2408605_11149
2672629,7.27,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,B,100075,,CHEMBL4267,TGF-beta receptor type II,SINGLE PROTEIN,Homo sapiens,2021.0,100075,2408605_100075


Set correct types.

In [5]:
df_mols = df_mols.astype({
    'year': 'Int64',
    'usan_year': 'Int64',
    'first_approval': 'Int64'
})

In [6]:
# df_mols.to_csv(path_results+"ChEMBL"+chembl_version+"_initial_query.csv", sep = ';', index = False)

In [7]:
############### TESTING: method to save dataset size at any given point to array ###############
# assay with sizes of full dataset
all_lengths = []
# assay with sizes of dataset with pchembl values
all_lengths_pchembl = []

def calculate_dataset_sizes(data):
    now_mols = len(set(data["parent_molregno"]))
    now_targets = len(set(data["tid"]))
    now_targets_mutation = len(set(data["tid_mutation"]))
    now_pairs = len(set(data['cpd_target_pair']))
    now_pairs_mutation = len(set(data['parent_molregno_tid_mutation']))
    
    if 'DTI' in data.columns:
        data_drugs = data[data["DTI"] == "D_DT"]
    else: 
        data_drugs = data[data["max_phase"] == 4]
        
    now_drugs = len(set(data_drugs["parent_molregno"]))
    now_drug_targets = len(set(data_drugs["tid"]))
    now_drug_targets_mutation = len(set(data_drugs["tid_mutation"]))
    now_drug_pairs = len(set(data_drugs['cpd_target_pair']))
    now_drug_pairs_mutation = len(set(data_drugs['parent_molregno_tid_mutation']))
    

    return [now_mols, now_drugs, 
            now_targets, now_drug_targets,
            now_targets_mutation, now_drug_targets_mutation,
            now_pairs, now_drug_pairs,
            now_pairs_mutation, now_drug_pairs_mutation]

def add_dataset_sizes(data, label, output=False):
    data_test = data.copy()
    data_test['parent_molregno_tid_mutation'] = data_test.agg('{0[parent_molregno]}_{0[tid_mutation]}'.format, axis=1)
    
    all_lengths.append([label] + calculate_dataset_sizes(data_test))
    
    # restrict to data with pchembl value
    if 'pchembl_value' in data_test.columns:
        data_pchembl = data_test[~data_test['pchembl_value'].isnull()]
    else:
        data_pchembl = data_test[~data_test['pchembl_value_mean'].isnull()]
    all_lengths_pchembl.append([label] + calculate_dataset_sizes(data_pchembl))

In [8]:
############### TESTING: initial query ###############
add_dataset_sizes(df_mols, "init", True)

# Calculate Mean, Median, and Max *pchembl* Values for Each Compound-Target Pair

The following values are set to summarise the information for compound-target pairs:  

|||
| :----------- | :----------- |
| *pchembl_value_mean* | mean pchembl value for a compound-target pair|
| *pchembl_value_max*| maximum pchembl value for a compound-target pair|
| *pchembl_value_median*| median pchembl value for a compound-target pair|
| *first_publication_cpd_target_pair* | first publication in ChEMBL with this compound-target pair |
| *first_publication_cpd_target_pair_w_pchembl* | first publication in ChEMBL with this compound-target pair and an associated pchembl value |

The values are set for a table with binding and functional assay data and another table with only binding assay data. These tables are combined into one table for further handling and can be distinguished by the parameter only_binding (binding and functional assay data = False; only binding data = True).

In [9]:
# summarise the information for binding and functional assays
df_mols_all = df_mols[(df_mols['assay_type'] == 'B') | (df_mols['assay_type'] == 'F')].copy()
df_mols_all['pchembl_value_mean'] = df_mols_all.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('mean')
df_mols_all['pchembl_value_max'] = df_mols_all.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('max')
df_mols_all['pchembl_value_median'] = df_mols_all.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('median')
df_mols_all['first_publication_cpd_target_pair'] = df_mols_all.groupby(['parent_molregno', 'tid_mutation'])['year'].transform('min')
df_mols_all_first_publication_pchembl = df_mols_all[~df_mols_all['pchembl_value'].isnull()].groupby(['parent_molregno', 'tid_mutation'])['year'].min().reset_index().rename(columns={"year": "first_publication_cpd_target_pair_w_pchembl"})
df_mols_all = df_mols_all.merge(df_mols_all_first_publication_pchembl, on=['parent_molregno', 'tid_mutation'], how='left')

In [10]:
# summarise the information for only binding assays
df_mols_binding = df_mols[df_mols['assay_type'] == 'B'].copy()
df_mols_binding['pchembl_value_mean'] = df_mols_binding.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('mean')
df_mols_binding['pchembl_value_max'] = df_mols_binding.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('max')
df_mols_binding['pchembl_value_median'] = df_mols_binding.groupby(['parent_molregno', 'tid_mutation'])['pchembl_value'].transform('median')
df_mols_binding['first_publication_cpd_target_pair'] = df_mols_binding.groupby(['parent_molregno', 'tid_mutation'])['year'].transform('min')
df_mols_binding_first_publication_pchembl = df_mols_binding[~df_mols_binding['pchembl_value'].isnull()].groupby(['parent_molregno', 'tid_mutation'])['year'].min().reset_index().rename(columns={"year": "first_publication_cpd_target_pair_w_pchembl"})
df_mols_binding = df_mols_binding.merge(df_mols_binding_first_publication_pchembl, on=['parent_molregno', 'tid_mutation'], how='left')

In [11]:
# combine the table based on binding and functional assays (only_binding = False)
# and the table based on only binding assays (only_binding = True)
df_mols_all['only_binding'] = False
df_mols_binding['only_binding'] = True
df_combined = pd.concat([df_mols_all, df_mols_binding])
# drop the information used for the aggregation of values
df_combined = df_combined.drop(columns=['pchembl_value', 'year', 'assay_type']).drop_duplicates()
df_combined

Unnamed: 0,parent_molregno,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,tid_mutation,cpd_target_pair,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_cpd_target_pair,first_publication_cpd_target_pair_w_pchembl,only_binding
0,252199,CHEMBL357278,,0,,,0,-1,0,0,0,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,10483,252199_10483,5.40,5.40,5.40,2004,2004,False
1,253534,CHEMBL357119,,0,,,0,-1,0,0,0,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,10483,253534_10483,4.77,4.77,4.77,2004,2004,False
2,253199,CHEMBL152968,,0,,,0,-1,0,0,0,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,10483,253199_10483,6.75,6.75,6.75,2004,2004,False
3,933,CHEMBL268439,,0,,,0,-1,0,0,0,10989,,CHEMBL2186,Carbonic anhydrase XIII,SINGLE PROTEIN,Mus musculus,10989,933_10989,8.70,8.70,8.70,2004,2004,False
4,82960,CHEMBL54530,,0,,,0,-1,0,0,0,11643,,CHEMBL4320,DNA topoisomerase III,SINGLE PROTEIN,Bacillus subtilis (strain 168),11643,82960_11643,4.72,4.72,4.72,1984,1984,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392378,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,101602,,CHEMBL5907,Serine/threonine-protein kinase LATS2,SINGLE PROTEIN,Homo sapiens,101602,2408605_101602,7.26,7.26,7.26,2021,2021,True
1392379,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,10811,,CHEMBL3231,Rho-associated protein kinase 1,SINGLE PROTEIN,Homo sapiens,10811,2408605_10811,7.01,7.01,7.01,2021,2021,True
1392380,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,11149,,CHEMBL2973,Rho-associated protein kinase 2,SINGLE PROTEIN,Homo sapiens,11149,2408605_11149,7.09,7.09,7.09,2021,2021,True
1392381,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,100075,,CHEMBL4267,TGF-beta receptor type II,SINGLE PROTEIN,Homo sapiens,100075,2408605_100075,7.27,7.27,7.27,2021,2021,True


# Extract Drug-Target Interactions With Disease Relevance From the drug_mechanism Table

Extract the known drug-target interactions from ChEMBL (these include some interactions between compounds with a max_phase < 4 and targets). These will be used to determine if drug-target pairs from the activities query above are known drug-target interactions. 

Note: Compound-target pairs can be in the drug_mechanisms table even though the compound is not a drug (max_phase < 4). For ease of writing, these will be referred to as drug-target interactions as well rather than compound-target pairs with a known disease-relevant interaction. 

Only entries with a disease_efficacy of 1 are taken into account, i.e., the target is believed to play a role in the efficacy of the drug.  
disease_efficacy: Flag to show whether the target assigned is believed to play a role in the efficacy of the drug in the indication(s) for which it is approved (1 = yes, 0 = no).

In [12]:
sql = '''
SELECT DISTINCT mh.parent_molregno, dm.tid
FROM drug_mechanism dm
INNER JOIN molecule_hierarchy mh
    ON dm.molregno = mh.molregno
INNER JOIN molecule_dictionary md
    ON mh.parent_molregno = md.molregno
WHERE dm.disease_efficacy = 1
    and dm.tid is not null
'''

df_dti = pd.read_sql_query(sql, con=chembl_con)
df_dti

Unnamed: 0,parent_molregno,tid
0,1124,11060
1,675068,10193
2,1125,10193
3,1085,10193
4,1124,10193
...,...,...
5909,2486701,120084
5910,2335784,120086
5911,2336055,106147
5912,2158101,10967


Query target_relations for related target ids to increase the number of target ids for which there is data in the drug_mechanisms table.
The following mappings are considered:

||||
|:------|:-----:|-----|
|protein family |-[superset of]->| single protein|
|protein complex |-[superset of]->| single protein|
|protein complex group |-[superset of]->| single protein|
|single protein |-[equivalent to]->| single protein|
|chimeric protein |-[superset of]->| single protein|
|protein-protein interaction |-[superset of]->| single protein|

For example, for *protein family -[superset of]-> single protein* this means:  
If there is a known relevant interaction between a compound and a protein family, interactions between the compound and single proteins of that protein family are considered to be known interactions as well.

In [13]:
sql = '''
SELECT tr.tid, tr.relationship, tr.related_tid, 
    td1.pref_name as pref_name_1, td1.target_type as target_type_1, td1.organism as organism_1, 
    td2.pref_name as pref_name_2, td2.target_type as target_type_2, td2.organism as organism_2 
FROM target_relations tr
INNER JOIN target_dictionary td1
    ON tr.tid = td1.tid
INNER JOIN target_dictionary td2
    ON tr.related_tid = td2.tid
'''

df_related_targets = pd.read_sql_query(sql, con=chembl_con)
df_related_targets.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2
0,11699,SUBSET OF,104812,PI4-kinase type II,SINGLE PROTEIN,Homo sapiens,"Phosphatidylinositol 4-kinase, PI4K",PROTEIN FAMILY,Homo sapiens
1,12261,SUBSET OF,104822,c-Jun N-terminal kinase 1,SINGLE PROTEIN,Homo sapiens,"c-Jun N-terminal kinase, JNK",PROTEIN FAMILY,Homo sapiens
2,12261,SUBSET OF,118329,c-Jun N-terminal kinase 1,SINGLE PROTEIN,Homo sapiens,Mitogen-activated protein kinase 8/9,PROTEIN FAMILY,Homo sapiens
3,12755,SUBSET OF,104684,Dopamine D5 receptor,SINGLE PROTEIN,Rattus norvegicus,Dopamine receptor,PROTEIN FAMILY,Rattus norvegicus
4,12735,SUBSET OF,105018,Phosphorylase kinase gamma subunit 2,SINGLE PROTEIN,Homo sapiens,Phosphorylase kinase,PROTEIN COMPLEX GROUP,Homo sapiens


In [14]:
protein_family_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN FAMILY") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

protein_complex_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN COMPLEX") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

protein_complex_group_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN COMPLEX GROUP") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

single_protein_mapping = df_related_targets[(df_related_targets["target_type_1"] == "SINGLE PROTEIN") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "EQUIVALENT TO")]

chimeric_protein_mapping = df_related_targets[(df_related_targets["target_type_1"] == "CHIMERIC PROTEIN") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

ppi_mapping = df_related_targets[(df_related_targets["target_type_1"] == "PROTEIN-PROTEIN INTERACTION") 
                    & (df_related_targets["target_type_2"] == "SINGLE PROTEIN")
                    & (df_related_targets["relationship"] == "SUPERSET OF")]

relevant_mappings = pd.concat([protein_family_mapping, 
                               protein_complex_mapping, 
                               protein_complex_group_mapping,
                               single_protein_mapping, 
                               chimeric_protein_mapping, 
                               ppi_mapping])
relevant_mappings.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2
269,104829,SUPERSET OF,11111,Cyclooxygenase,PROTEIN FAMILY,Bos taurus,Cyclooxygenase-2,SINGLE PROTEIN,Bos taurus
270,104829,SUPERSET OF,17019,Cyclooxygenase,PROTEIN FAMILY,Bos taurus,Cyclooxygenase-1,SINGLE PROTEIN,Bos taurus
273,104745,SUPERSET OF,10868,Leukotriene B4 receptor,PROTEIN FAMILY,Homo sapiens,Leukotriene B4 receptor 2,SINGLE PROTEIN,Homo sapiens
274,104745,SUPERSET OF,10542,Leukotriene B4 receptor,PROTEIN FAMILY,Homo sapiens,Leukotriene B4 receptor 1,SINGLE PROTEIN,Homo sapiens
288,104699,SUPERSET OF,12854,Adenosine A2 receptor,PROTEIN FAMILY,Rattus norvegicus,Adenosine A2b receptor,SINGLE PROTEIN,Rattus norvegicus


Combine the drug-target-interactions (DTI) and target ids (dti_tids) from the drug mechanism table with the information based on the mapped target ids.

In [15]:
# drug-target-interactions (DTI) and target ids (dti_tids) based on the drug_mechanisms table
DTIs_original = set(df_dti.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))
dti_tids_original = set(df_dti['tid'])

# drug-target-interactions (DTI) and target ids (dti_tids) based on mapped target ids
df_dti_add_targets = df_dti.merge(relevant_mappings, on = 'tid', how = 'inner')
DTIs_mapped = set(df_dti_add_targets.agg('{0[parent_molregno]}_{0[related_tid]}'.format, axis=1))
dti_tids_mapped = set(df_dti_add_targets['related_tid'].astype("int"))

# combined drug-target-interactions (DTI) and target ids (dti_tids) 
# based on drug_mechanisms table and mapped target ids
DTIs_set = DTIs_original.union(DTIs_mapped)
dti_tids_set = dti_tids_original.union(dti_tids_mapped)

# DTI (Drug-Target Interaction) Annotations

Every compound-target pair is assigned a DTI (drug target interaction) annotation.  

The assignement is based on three questions:
- Is the compound-target pair in the drug_mechanisms table? = Is it a known relevant compound-target interaction?
- What is the max_phase of the compound? = Is it a drug / clinical compound?
- Is the target in the drug_mechanisms table = Is it a therapeutic target?

The assigments are based on the following table:

|in drug_mechanisms table?|max_phase?|therapeutic target?|DTI annotation|explanation|
|:-----:|:-----:|:-----:|:-----:|:-----|
|yes|4|-|D_DT|drug - drug target|
|yes|3|-|C3_DT|clinical candidate in phase 3 - drug target|
|yes|2|-|C2_DT|clinical candidate in phase 2 - drug target|
|yes|1|-|C1_DT|clinical candidate in phase 1 - drug target|
|yes|0|-|C0_DT|compound in phase 0 - drug target|
|no|-|yes|DT|drug target|
|no|-|no|NDT|not drug target|




Identify which targets are therapeutic targets (= are they in the drug_mechanism table?) and add the field *therapeutic_target* that indicates whether target is a known therapeutic target.  

In [16]:
df_combined['therapeutic_target'] = df_combined['tid'].isin(dti_tids_set)

Assign the annotations based on the table.

In [17]:
df_combined.loc[(df_combined['cpd_target_pair'].isin(DTIs_set) & (df_combined['max_phase'] == 4)), 'DTI'] = "D_DT"
df_combined.loc[(df_combined['cpd_target_pair'].isin(DTIs_set) & (df_combined['max_phase'] == 3)), 'DTI'] = "C3_DT"
df_combined.loc[(df_combined['cpd_target_pair'].isin(DTIs_set) & (df_combined['max_phase'] == 2)), 'DTI'] = "C2_DT"
df_combined.loc[(df_combined['cpd_target_pair'].isin(DTIs_set) & (df_combined['max_phase'] == 1)), 'DTI'] = "C1_DT"
df_combined.loc[(df_combined['cpd_target_pair'].isin(DTIs_set) & (df_combined['max_phase'] == 0)), 'DTI'] = "C0_DT"
df_combined.loc[((~df_combined['cpd_target_pair'].isin(DTIs_set)) 
                 & (df_combined['therapeutic_target'] == True)), 'DTI'] = "DT"
# if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set
# (~df_combined['cpd_target_pair'].isin(DTIs_set)) is included for clarity
df_combined.loc[((~df_combined['cpd_target_pair'].isin(DTIs_set)) 
                 & (df_combined['therapeutic_target'] == False)), 'DTI'] = "NDT"

In [18]:
############### TESTING: before discarding NDT rows ###############
add_dataset_sizes(df_combined, "pre DTI")

Discard rows that were annotated with NDT, i.e., compound-target pairs that are not in the drug_mechanisms table and for which the target was also not in the drug_mechanisms table (not a comparator compound).

In [19]:
# discard NDT rows
df_combined = df_combined[(df_combined['DTI'].isin(['D_DT', 'C3_DT', 'C2_DT', 'C1_DT', 'C0_DT', 'DT']))]

Keep only compounds with pchembl_value or compound target pairs with known interaction, 
i.e. compound-target pairs that are in the drug_mechanism table are not required to have a pchembl value.

In [20]:
# TODO
# add compounds from drug_mechanism table, not requiring them to have a pchembl value
print(df_combined.columns)
df_combined

Index(['parent_molregno', 'parent_chemblid', 'parent_pref_name', 'max_phase',
       'parenteral', 'topical', 'tid', 'mutation', 'target_chembl_id',
       'target_pref_name', 'target_type', 'organism', 'tid_mutation',
       'cpd_target_pair', 'pchembl_value_mean', 'pchembl_value_max',
       'pchembl_value_median', 'first_publication_cpd_target_pair',
       'first_publication_cpd_target_pair_w_pchembl', 'only_binding',
       'therapeutic_target', 'DTI'],
      dtype='object')


Unnamed: 0,parent_molregno,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,tid_mutation,cpd_target_pair,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_cpd_target_pair,first_publication_cpd_target_pair_w_pchembl,only_binding,therapeutic_target,DTI
9,100708,CHEMBL305153,,0,,,0,-1,0,0,0,50,,CHEMBL213,Beta-1 adrenergic receptor,SINGLE PROTEIN,Homo sapiens,50,100708_50,7.53,7.53,7.53,1982,1982,False,True,DT
18,1798744,CHEMBL3350133,,0,,,0,-1,0,0,0,136,,CHEMBL236,Delta opioid receptor,SINGLE PROTEIN,Homo sapiens,136,1798744_136,6.96,6.96,6.96,1986,1986,False,True,DT
20,88622,CHEMBL57825,,0,,,0,-1,0,0,0,248,,CHEMBL1835,Thromboxane-A synthase,SINGLE PROTEIN,Homo sapiens,248,88622_248,6.00,6.00,6.00,1987,1987,False,True,DT
24,97517,CHEMBL303519,,0,,,0,-1,0,0,0,72,,CHEMBL217,Dopamine D2 receptor,SINGLE PROTEIN,Homo sapiens,72,97517_72,5.01,5.01,5.01,1998,1998,False,True,DT
25,97517,CHEMBL303519,,0,,,0,-1,0,0,0,130,,CHEMBL234,Dopamine D3 receptor,SINGLE PROTEIN,Homo sapiens,130,97517_130,5.25,5.25,5.25,1998,1998,False,True,DT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392370,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,12829,,CHEMBL3055,Cyclin-dependent kinase 7,SINGLE PROTEIN,Homo sapiens,12829,2408605_12829,6.64,6.64,6.64,2021,2021,True,True,DT
1392372,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,11639,,CHEMBL3385,MAP kinase ERK1,SINGLE PROTEIN,Homo sapiens,11639,2408605_11639,8.82,8.82,8.82,2021,2021,True,True,DT
1392373,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,11638,,CHEMBL4040,MAP kinase ERK2,SINGLE PROTEIN,Homo sapiens,11638,2408605_11638,8.32,8.32,8.32,2021,2021,True,True,DT
1392379,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,10811,,CHEMBL3231,Rho-associated protein kinase 1,SINGLE PROTEIN,Homo sapiens,10811,2408605_10811,7.01,7.01,7.01,2021,2021,True,True,DT


In [21]:
############### TESTING: after discarding NDT rows ###############
add_dataset_sizes(df_combined, "post DTI")

# Add Compound Properties Based on ChEMBL Data

## Add First Appearance of Compound in the Literature

Query and calculate the first appearance of a compound in the literature based on ChEMBL data.

In [22]:
# first appearance of a compound in the literature 
# information about salts is aggregated in the parent
sql = '''
SELECT DISTINCT docs.year, mh.parent_molregno
FROM docs
LEFT JOIN compound_records cr
    ON docs.doc_id = cr.doc_id
INNER JOIN molecule_hierarchy mh 
    ON cr.molregno = mh.molregno   -- cr.molregno = salt_molregno
WHERE docs.year is not null
'''

df_docs = pd.read_sql_query(sql, con=chembl_con)
df_docs['first_publication_cpd'] = df_docs.groupby('parent_molregno')['year'].transform('min')
df_docs = df_docs[['parent_molregno', 'first_publication_cpd']].drop_duplicates()
df_docs

Unnamed: 0,parent_molregno,first_publication_cpd
0,4941,1974
1,921,1974
2,1005421,1976
3,1750777,1976
4,1750778,1976
...,...,...
2008029,2658656,2022
2008033,2708719,2022
2008035,2587403,2022
2008036,2574142,2022


Combine with previous data.

In [23]:
df_combined = df_combined.merge(df_docs, on = 'parent_molregno', how='left')
df_combined = df_combined.astype({'first_publication_cpd': 'Int64'})
df_combined

Unnamed: 0,parent_molregno,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,tid_mutation,cpd_target_pair,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_cpd_target_pair,first_publication_cpd_target_pair_w_pchembl,only_binding,therapeutic_target,DTI,first_publication_cpd
0,100708,CHEMBL305153,,0,,,0,-1,0,0,0,50,,CHEMBL213,Beta-1 adrenergic receptor,SINGLE PROTEIN,Homo sapiens,50,100708_50,7.53,7.53,7.53,1982,1982,False,True,DT,1982
1,1798744,CHEMBL3350133,,0,,,0,-1,0,0,0,136,,CHEMBL236,Delta opioid receptor,SINGLE PROTEIN,Homo sapiens,136,1798744_136,6.96,6.96,6.96,1986,1986,False,True,DT,1986
2,88622,CHEMBL57825,,0,,,0,-1,0,0,0,248,,CHEMBL1835,Thromboxane-A synthase,SINGLE PROTEIN,Homo sapiens,248,88622_248,6.00,6.00,6.00,1987,1987,False,True,DT,1987
3,97517,CHEMBL303519,,0,,,0,-1,0,0,0,72,,CHEMBL217,Dopamine D2 receptor,SINGLE PROTEIN,Homo sapiens,72,97517_72,5.01,5.01,5.01,1998,1998,False,True,DT,1998
4,97517,CHEMBL303519,,0,,,0,-1,0,0,0,130,,CHEMBL234,Dopamine D3 receptor,SINGLE PROTEIN,Homo sapiens,130,97517_130,5.25,5.25,5.25,1998,1998,False,True,DT,1998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772012,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,12829,,CHEMBL3055,Cyclin-dependent kinase 7,SINGLE PROTEIN,Homo sapiens,12829,2408605_12829,6.64,6.64,6.64,2021,2021,True,True,DT,2021
1772013,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,11639,,CHEMBL3385,MAP kinase ERK1,SINGLE PROTEIN,Homo sapiens,11639,2408605_11639,8.82,8.82,8.82,2021,2021,True,True,DT,2021
1772014,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,11638,,CHEMBL4040,MAP kinase ERK2,SINGLE PROTEIN,Homo sapiens,11638,2408605_11638,8.32,8.32,8.32,2021,2021,True,True,DT,2021
1772015,2408605,CHEMBL4538174,ERKi,0,,,0,-1,0,0,0,10811,,CHEMBL3231,Rho-associated protein kinase 1,SINGLE PROTEIN,Homo sapiens,10811,2408605_10811,7.01,7.01,7.01,2021,2021,True,True,DT,2021


## Add ChEMBL Compound Properties and Compound Structures

Add compound properties and structures based on the compound_properties table and the compound_structures table. 

In [24]:
sql = '''
SELECT DISTINCT mh.parent_molregno, 
    cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
    cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
    cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
    cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
    struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
FROM compound_properties cp
INNER JOIN molecule_hierarchy mh
    ON cp.molregno = mh.parent_molregno
INNER JOIN compound_structures struct
    ON mh.parent_molregno = struct.molregno
'''

df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,2657369,411.94,3.82,4.0,2.0,64.26,5.0,N,0.0,11.94,6.11,3.98,3.96,NEUTRAL,411.94,3.0,29.0,0.67,411.1826,C22H26ClN5O,6.0,2.0,0.0,InChI=1S/C22H26ClN5O/c1-27(2)18-6-3-15(4-7-18)14-24-21(29)16-9-11-28(12-10-16)22-25-19-8-5-17(23...,AAAADVYFXUUVEO-UHFFFAOYSA-N,CN(C)c1ccc(CNC(=O)C2CCN(c3nc4ccc(Cl)cc4[nH]3)CC2)cc1
1,477782,506.37,3.04,8.0,2.0,116.43,8.0,N,1.0,,6.5,2.16,2.11,NEUTRAL,506.37,2.0,27.0,0.53,506.0485,C17H23IN4O4S,8.0,3.0,1.0,"InChI=1S/C17H23IN4O4S/c1-10(2)11-7-14(25-3)12(18)8-13(11)26-15-9-21-17(22-16(15)19)20-5-6-27(4,2...",AAAAEENPAALFRN-UHFFFAOYSA-N,COc1cc(C(C)C)c(Oc2cnc(NCCS(C)(=O)=O)nc2N)cc1I
2,2237474,927.28,7.03,11.0,7.0,252.91,41.0,N,4.0,4.13,,8.43,5.36,ACID,927.28,0.0,65.0,0.02,926.6555,C49H90N4O12,16.0,8.0,4.0,InChI=1S/C49H90N4O12/c1-5-8-10-12-14-16-18-20-22-24-26-28-30-37(31-29-27-25-23-21-19-17-15-13-11...,AAAAJHGLNDAXFP-VNKVACROSA-N,CCCCCCCCCCCCCCC(CCCCCCCCCCCCCC)C(=O)OC[C@H]1OC(O)[C@H](NC(C)=O)[C@@H](OCC(=O)N[C@@H](CC)C(=O)N[C...
3,412019,271.32,1.72,2.0,2.0,65.2,1.0,N,0.0,13.43,,0.77,0.77,NEUTRAL,271.32,2.0,20.0,0.83,271.1321,C15H17N3O2,5.0,2.0,0.0,"InChI=1S/C15H17N3O2/c1-8-7-16-14(19)13-12(8)10-6-9(15(20)18(2)3)4-5-11(10)17-13/h4-6,8,17H,7H2,1...",AAAAKTROWFNLEP-UHFFFAOYSA-N,CC1CNC(=O)c2[nH]c3ccc(C(=O)N(C)C)cc3c21
4,26284,323.35,2.13,4.0,1.0,71.53,3.0,N,0.0,,4.73,1.13,1.13,NEUTRAL,323.35,2.0,24.0,0.94,323.127,C18H17N3O3,6.0,1.0,0.0,InChI=1S/C18H17N3O3/c1-11(22)20-10-17-16-8-14-7-12(13-3-2-6-19-9-13)4-5-15(14)21(16)18(23)24-17/...,AAAATQFUBIBQIS-IRXDYDNUSA-N,CC(=O)NC[C@@H]1OC(=O)N2c3ccc(-c4cccnc4)cc3C[C@@H]12


Combine with previous data.

In [25]:
df_combined = df_combined.merge(df_cpd_props, on = 'parent_molregno', how = 'inner')

In [26]:
############### TESTING: compound props ###############
add_dataset_sizes(df_combined, "cpd props")

## Calculate Ligand Efficiency (LE) Metrics

Calculate the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas:

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $ \Delta\text{G} = − RT \ln(K_d)$, $− RT\ln(K_i)$, or $− RT\ln(IC_{50})$

$\text{LE}=\frac{(2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl_value})} {\text{heavy_atoms}}$


$\text{BEI}=\frac{\text{pchembl_mean} \cdot 1000} {\text{mw_freebase}}$

$\text{SEI}=\frac{\text{pchembl_mean} \cdot 100} {\text{PSA}}$

$\text{LLE}=\text{pchembl_mean}-\text{ALOGP}$

In [27]:
df_combined['LE'] = df_combined['pchembl_value_mean']/df_combined['heavy_atoms']*(2.303*298*0.00199)
df_combined['BEI'] = df_combined['pchembl_value_mean']*1000/df_combined["mw_freebase"]
df_combined['SEI'] = df_combined['pchembl_value_mean']*100/df_combined["psa"]
df_combined['LLE'] = df_combined['pchembl_value_mean']-df_combined["alogp"]

## Add ATC Classifications (Level 1)

Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables.

In [28]:
sql = '''
SELECT DISTINCT mh.parent_molregno, atc.level1, level1_description
FROM atc_classification atc
INNER JOIN molecule_atc_classification matc
    ON atc.level5 = matc.level5
INNER JOIN molecule_hierarchy mh
    ON matc.molregno = mh.molregno
'''

atc_levels = pd.read_sql_query(sql, con=chembl_con)
atc_levels["l1_full"] = atc_levels["level1"] + "_" + atc_levels["level1_description"]
atc_levels

Unnamed: 0,parent_molregno,level1,level1_description,l1_full
0,2089491,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
1,608601,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,1567700,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,579824,D,DERMATOLOGICALS,D_DERMATOLOGICALS
4,1763584,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...,...,...
3895,675276,A,ALIMENTARY TRACT AND METABOLISM,A_ALIMENTARY TRACT AND METABOLISM
3896,2197623,N,NERVOUS SYSTEM,N_NERVOUS SYSTEM
3897,1383224,C,CARDIOVASCULAR SYSTEM,C_CARDIOVASCULAR SYSTEM
3898,675183,A,ALIMENTARY TRACT AND METABOLISM,A_ALIMENTARY TRACT AND METABOLISM


Combine ATC level annotations for the same parent_molregno into one description.

In [29]:
between_str_join = ' | '
atc_levels['atc_level1'] = atc_levels.groupby(['parent_molregno'])['l1_full'].transform(lambda x: between_str_join.join(sorted(x)))
atc_levels = atc_levels[['parent_molregno', 'atc_level1']].drop_duplicates()
atc_levels

Unnamed: 0,parent_molregno,atc_level1
0,2089491,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
1,608601,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,1567700,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,579824,D_DERMATOLOGICALS
4,1763584,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...
3895,675276,A_ALIMENTARY TRACT AND METABOLISM
3896,2197623,N_NERVOUS SYSTEM
3897,1383224,C_CARDIOVASCULAR SYSTEM
3898,675183,A_ALIMENTARY TRACT AND METABOLISM


Combine with previous data.

In [30]:
df_combined = df_combined.merge(atc_levels, on='parent_molregno', how = 'left')

# Add Target Class Annotations Based on ChEMBL Data

Add information about level 1 and level 2 target class annotations in ChEMBL.

In [31]:
sql = '''
SELECT DISTINCT tc.tid, 
    pc.protein_class_id, pc.pref_name, pc.short_name, pc.protein_class_desc, pc.definition,
    pfc.l1, pfc.l2
FROM protein_classification pc
-- join several tables to get the corresponding target id
INNER JOIN component_class cc
    ON pc.protein_class_id = cc.protein_class_id
INNER JOIN component_sequences cs
    ON cc.component_id = cs.component_id
INNER JOIN target_components tc
    ON cs.component_id = tc.component_id
-- join the protein_family_classification table for a faster way to traverse the hierarchy
INNER JOIN protein_family_classification pfc 
    ON  pc.protein_class_id = pfc.protein_class_id
'''

df_target_classes = pd.read_sql_query(sql, con=chembl_con)
# only interested in the target ids that are in the current dataset
current_tids = set(df_combined['tid'])
df_target_classes = df_target_classes[df_target_classes['tid'].isin(current_tids)]
df_target_classes

Unnamed: 0,tid,protein_class_id,pref_name,short_name,protein_class_desc,definition,l1,l2
0,1,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond,Enzyme,Hydrolase
1,2,1133,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...,Transporter,Primary active transporter
2,3,104,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,,Enzyme,Phosphodiesterase
3,4,1583,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...,Ion channel,Voltage-gated ion channel
5,6,10,Oxidoreductase,Reductase,enzyme reductase,The class of all enzymes catalyzing oxidoreduction reactions. The substrate that is oxidized is ...,Enzyme,Oxidoreductase
...,...,...,...,...,...,...,...,...
11123,119972,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11162,120055,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11502,120396,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11504,120399,89,Threonine protease T1A subfamily,T1A,enzyme protease threonine pbt t1a,,Enzyme,Protease


Summarise the information for a target id with several assigned target classes of level 1 into one description. If a target id has more than one assigned target class, the target class 'Unclassified protein' is discarded.

In [32]:
level = 'l1'
between_str_join = '|'
target_classes_level = df_target_classes[['tid', level]].drop_duplicates().dropna()

# remove 'Unclassified protein' from targets with more than one target class, level 1
more_than_one = target_classes_level.groupby(['tid'])[level].count()
target_classes_level = target_classes_level[
    (target_classes_level['tid'].isin(more_than_one[more_than_one == 1].index.tolist())) 
    | ((target_classes_level['tid'].isin(more_than_one[more_than_one > 1].index.tolist())) 
       & (target_classes_level['l1'] != 'Unclassified protein'))]

target_classes_level['target_class_l1'] = target_classes_level.groupby(['tid'])[level].transform(lambda x: between_str_join.join(sorted(x)))
target_classes_level = target_classes_level[['tid', 'target_class_l1']].drop_duplicates()

df_combined = df_combined.merge(target_classes_level, on='tid', how = 'left')

Repeat the summary step for target classes of level 2.

In [33]:
level = 'l2'
target_classes_level = df_target_classes[['tid', level]].drop_duplicates().dropna()
target_classes_level['target_class_l2'] = target_classes_level.groupby(['tid'])[level].transform(lambda x: between_str_join.join(sorted(x)))
target_classes_level = target_classes_level[['tid', 'target_class_l2']].drop_duplicates()

df_combined = df_combined.merge(target_classes_level, on='tid', how = 'left')

Instances with targets with more than one target class assigned to them.  
These could be reassigned by hand if a single target class is preferable.

In [34]:
############### TESTING: which targets have more than one level 1 target class assigned to them? ###############
test = df_combined[(df_combined['target_class_l1'].str.contains('|', regex=False))][['tid', 'target_pref_name', 'target_type', 'target_class_l1', 'target_class_l2']].drop_duplicates()
print("#Instances with >1 level 1 target class:", len(test))
test

#Instances with >1 level 1 target class: 33


Unnamed: 0,tid,target_pref_name,target_type,target_class_l1,target_class_l2
36,11531,Nuclear factor NF-kappa-B p105 subunit,SINGLE PROTEIN,Other cytosolic protein|Transcription factor,
727,104295,Cyclin-dependent kinase 4/cyclin D1,PROTEIN COMPLEX,Enzyme|Other cytosolic protein,Kinase
3898,105036,Atrial natriuretic peptide receptor,PROTEIN FAMILY,Enzyme|Membrane receptor,Lyase
6844,104811,Bcr/Abl fusion protein,CHIMERIC PROTEIN,Enzyme|Other cytosolic protein,Kinase
9125,109099,Platelet glycoprotein VI (GPVI),SINGLE PROTEIN,Adhesion|Membrane receptor,
10263,105079,NPM/ALK (Nucleophosmin/ALK tyrosine kinase receptor),CHIMERIC PROTEIN,Enzyme|Other nuclear protein,Kinase
10372,100128,Breakpoint cluster region protein,SINGLE PROTEIN,Enzyme|Other cytosolic protein,Kinase
11474,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion channel|Membrane receptor,Family A G protein-coupled receptor|Ligand-gated ion channel
12159,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
13918,104730,Nuclear factor NF-kappa-B complex,PROTEIN COMPLEX GROUP,Other cytosolic protein|Transcription factor,


In [35]:
############### TESTING: which targets have more than one level 2 target class assigned to them? ###############df_combined_test = df_combined[~(df_combined['target_class_l2'].isnull())]
test = df_combined[(~df_combined['target_class_l2'].isnull()) & (df_combined['target_class_l2'].str.contains('|', regex=False))][['tid', 'target_pref_name', 'target_type', 'target_class_l1', 'target_class_l2']].drop_duplicates()
print("#Instances with >1 level 2 target class:", len(test))
test

#Instances with >1 level 2 target class: 18


Unnamed: 0,tid,target_pref_name,target_type,target_class_l1,target_class_l2
35,104677,Menin/Histone-lysine N-methyltransferase MLL,PROTEIN-PROTEIN INTERACTION,Epigenetic regulator,Reader|Writer
37,103732,Histone-lysine N-methyltransferase MLL,SINGLE PROTEIN,Epigenetic regulator,Reader|Writer
7989,101310,CREB-binding protein,SINGLE PROTEIN,Epigenetic regulator,Reader|Writer
11474,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion channel|Membrane receptor,Family A G protein-coupled receptor|Ligand-gated ion channel
12159,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
15543,104758,Potassium-transporting ATPase,PROTEIN COMPLEX,Enzyme|Transporter,Hydrolase|Primary active transporter
16016,104782,"Sulfonylurea receptor 2, Kir6.2",PROTEIN COMPLEX,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
18574,104770,Sodium/potassium-transporting ATPase,PROTEIN COMPLEX GROUP,Enzyme|Ion channel|Transporter,Hydrolase|Other ion channel|Primary active transporter
19301,29,Sodium/potassium-transporting ATPase alpha-1 chain,SINGLE PROTEIN,Enzyme|Transporter,Hydrolase|Primary active transporter
22911,105734,Voltage-gated calcium channel,PROTEIN COMPLEX GROUP,Auxiliary transport protein|Ion channel,Calcium channel auxiliary subunit alpha2delta family|Calcium channel auxiliary subunit beta fami...


# Add RDKit-Based Compound Descriptors

## Built-in Compound Descriptors

Add relevant compound descriptors using built-in RDKit methods. 

In [36]:
# # add a column with RDKit molecules, used to calculate the descriptors
# PandasTools.AddMoleculeColumnToFrame(df_combined,'canonical_smiles','mol',includeFingerprints=False)

# df_combined.loc[:,'fraction_csp3'] = df_combined['mol'].apply(Descriptors.FractionCSP3)
# df_combined.loc[:,'num_aliphatic_carbocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticCarbocycles)
# df_combined.loc[:,'num_aliphatic_heterocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticHeterocycles)
# df_combined.loc[:,'num_aliphatic_rings'] = df_combined['mol'].apply(Descriptors.NumAliphaticRings)
# df_combined.loc[:,'num_aromatic_carbocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticCarbocycles)
# df_combined.loc[:,'num_aromatic_heterocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticHeterocycles)
# df_combined.loc[:,'num_aromatic_rings'] = df_combined['mol'].apply(Descriptors.NumAromaticRings)
# df_combined.loc[:,'num_heteroatoms'] = df_combined['mol'].apply(Descriptors.NumHeteroatoms)
# df_combined.loc[:,'num_saturated_carbocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedCarbocycles)
# df_combined.loc[:,'num_saturated_heterocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedHeterocycles)
# df_combined.loc[:,'num_saturated_rings'] = df_combined['mol'].apply(Descriptors.NumSaturatedRings)
# df_combined.loc[:,'ring_count'] = df_combined['mol'].apply(Descriptors.RingCount)
# df_combined.loc[:,'num_stereocentres'] = df_combined['mol'].apply(Chem.rdMolDescriptors.CalcNumAtomStereoCenters)

# # drop the column with RDKit molecules
# df_combined = df_combined.drop(['mol'] , axis=1)

## Aromaticity Descriptors

Add descriptors for aromaticity, using an RDKit-based method.

In [37]:
def calculate_aromatic_atoms(smiles_set):
    aromatic_atoms_dict = dict()
    aromatic_c_dict = dict()
    aromatic_n_dict = dict()
    aromatic_hetero_dict = dict()
    
    for smiles in tqdm(smiles_set):
        mol = Chem.MolFromSmiles(smiles)
        aromatic_atoms_dict[smiles] = sum(mol.GetAtomWithIdx(i).GetIsAromatic() for i in range(mol.GetNumAtoms()))
        aromatic_c_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() == 6)) for i in range(mol.GetNumAtoms()))
        aromatic_n_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() == 7)) for i in range(mol.GetNumAtoms()))
        aromatic_hetero_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() != 6) & (mol.GetAtomWithIdx(i).GetAtomicNum() != 1)) for i in range(mol.GetNumAtoms()))
        
    return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict

In [38]:
# smiles_set = set(df_combined["canonical_smiles"])
# aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = calculate_aromatic_atoms(list(smiles_set))

# df_combined['aromatic_atoms'] = df_combined['canonical_smiles'].map(aromatic_atoms_dict)
# df_combined['aromatic_c'] = df_combined['canonical_smiles'].map(aromatic_c_dict)
# df_combined['aromatic_n'] = df_combined['canonical_smiles'].map(aromatic_n_dict)
# df_combined['aromatic_hetero'] = df_combined['canonical_smiles'].map(aromatic_hetero_dict)

## Scaffold SMILES

Add the scaffold SMILES for every molecule. For the column *scaffold_w_stereo* the stereochemistry is taken into account. For the column *scaffold_wo_stereo* the stereochemistry information is removed before calculating the scaffold.

In [39]:
# note: this takes a few minutes to calculate for all molecules
def calculate_scaffolds(smiles_set):
    scaffolds_dict = dict()
    scaffolds_no_stereo_dict = dict()
    
    for smiles in tqdm(smiles_set):
        mol = Chem.MolFromSmiles(smiles)
        # skip acyclic molecules
        if Chem.rdMolDescriptors.CalcNumRings(mol) == 0:
            continue

        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_dict[smiles] = Chem.MolToSmiles(scaffold)
        
        # repeat after removing stereochemistry
        Chem.RemoveStereochemistry(mol)
        scaffold_no_stereo = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_no_stereo_dict[smiles] = Chem.MolToSmiles(scaffold_no_stereo)
        
    return scaffolds_dict, scaffolds_no_stereo_dict

In [40]:
# smiles_set = set(df_combined["canonical_smiles"])
# scaffolds_dict, scaffolds_no_stereo_dict = calculate_scaffolds(smiles_set)

# df_combined["scaffold_w_stereo"] = df_combined['canonical_smiles'].map(scaffolds_dict)
# df_combined['scaffold_wo_stereo'] = df_combined['canonical_smiles'].map(scaffolds_no_stereo_dict)

# Get Relevant Subsets of the Data

Calculate different subsets of the data based on binding and functional data in ChEMBL.

In [41]:
# function to calculate and return the different subsets of interest
def get_data_subsets(min_nof_cpds, data):
    # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value.
    comparator_counts = data[~data['pchembl_value_mean'].isnull()].groupby(['tid_mutation'])['parent_molregno'].count()
    targets_w_enough_cpds = comparator_counts[comparator_counts >= min_nof_cpds].index.tolist()
    df_enough_cpds = data.query('tid_mutation in @targets_w_enough_cpds')
    
    # Restrict the dataset further to targets with at least one compound-target pair labelled as 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT', 
    # i.e. a compound-target pair with a known interaction.
    c_dt_d_dt_targets = set(df_enough_cpds[df_enough_cpds['DTI'].isin(['D_DT', 'C3_DT', 'C2_DT', 'C1_DT', 'C0_DT'])].tid_mutation.to_list())
    df_c_dt_d_dt = df_enough_cpds.query('tid_mutation in @c_dt_d_dt_targets')
    
    # Restrict the dataset further to targets with at least one compound-target pair labelled as 'D_DT', 
    # i.e. a known drug-target interaction. 
    d_dt_targets = set(df_enough_cpds[df_enough_cpds['DTI'] == 'D_DT'].tid_mutation.to_list())
    df_d_dt = df_enough_cpds.query('tid_mutation in @d_dt_targets')
    
    return df_enough_cpds, df_c_dt_d_dt, df_d_dt

## Binding and Functional Assays

In [42]:
# consider binding and functional assays
min_nof_cpds = 100
df_combined_all = df_combined[(df_combined['only_binding'] == False)]
df_combined_all_enough_cpds, df_combined_all_c_dt_d_dt, df_combined_all_d_dt = get_data_subsets(min_nof_cpds, df_combined_all)

In [43]:
# df_combined_all.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.csv", sep = ";", index = False)
# # df_combined_all.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.xlsx", index = False)

# df_combined_all_enough_cpds.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + ".csv", sep = ";", index = False)
# # df_combined_all_enough_cpds.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + ".xlsx", index = False)

# df_combined_all_c_dt_d_dt.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + "_c_dt_d_dt.csv", sep = ";", index = False)
# # df_combined_all_c_dt_d_dt.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + "_c_dt_d_dt.xlsx", index = False)

# df_combined_all_d_dt.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + "_d_dt.csv", sep = ";", index = False)
# # df_combined_all_d_dt.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_" + str(min_nof_cpds) + "_d_dt.xlsx", index = False)

In [44]:
############### TESTING: binding and functional assays ###############
add_dataset_sizes(df_combined_all, "all assays")
add_dataset_sizes(df_combined_all_enough_cpds, "all, >= 100")
add_dataset_sizes(df_combined_all_c_dt_d_dt, "all, >= 100, c_dt and d_dt")
add_dataset_sizes(df_combined_all_d_dt, "all, >= 100, d_dt")

## Only Binding Assays

In [45]:
# consider only binding assays and therapeutic targets
min_nof_cpds = 100
df_combined_B = df_combined[(df_combined['only_binding'] == True)]
df_combined_B_enough_cpds, df_combined_B_c_dt_d_dt, df_combined_B_d_dt = get_data_subsets(min_nof_cpds, df_combined_B)

In [46]:
# df_combined_B.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding.csv", sep = ";", index = False)
# # df_combined_B.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding.xlsx", index = False)

# df_combined_B_enough_cpds.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + ".csv", sep = ";", index = False)
# # df_combined_B_enough_cpds.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + ".xlsx", index = False)

# df_combined_B_c_dt_d_dt.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + "_c_dt_d_dt.csv", sep = ";", index = False)
# # df_combined_B_c_dt_d_dt.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + "_c_dt_d_dt.xlsx", index = False)

# df_combined_B_d_dt.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + "_d_dt.csv", sep = ";", index = False)
# # df_combined_B_d_dt.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_" + str(min_nof_cpds) + "_d_dt.xlsx", index = False)

In [47]:
############### TESTING: binding assays ###############
add_dataset_sizes(df_combined_B, "binding")
add_dataset_sizes(df_combined_B_enough_cpds, "b, >= 100")
add_dataset_sizes(df_combined_B_c_dt_d_dt, "b, >= 100, c_dt and d_dt")
add_dataset_sizes(df_combined_B_d_dt, "b, >= 100, d_dt")

# Testing: Overview of Dataset Sizes at Different Points in the Pipeline

In [48]:
############### TESTING: development of the full dataset size ###############
print("Size of full dataset at different points")
pd.DataFrame(all_lengths,
                   columns=['type', 
                            '#mols', '#drugs', 
                            '#targets', '#drug_ targets', 
                            '#targets_ mutation', '#drug_ targets_mutation', 
                            '#cpd_tid_ pairs', '#drug_tid_ pairs',
                            '#cpd_ tid_mutation_ pairs', '#drug_ tid_mutation_ pairs'])

Size of full dataset at different points


Unnamed: 0,type,#mols,#drugs,#targets,#drug_ targets,#targets_ mutation,#drug_ targets_mutation,#cpd_tid_ pairs,#drug_tid_ pairs,#cpd_ tid_mutation_ pairs,#drug_ tid_mutation_ pairs
0,init,1009870,1735,6806,2581,8184,2983,2235264,23013,2248694,24074
1,pre DTI,1006861,812,6741,414,8104,627,2209353,1454,2222330,1960
2,post DTI,658626,812,1106,414,1990,627,1022315,1454,1033164,1960
3,cpd props,657886,810,1106,414,1990,627,1021147,1452,1031958,1958
4,all assays,657886,810,1106,414,1990,627,1021147,1452,1031958,1958
5,"all, >= 100",650556,770,673,331,701,354,1006980,1332,1012249,1431
6,"all, >= 100, c_dt and d_dt",500948,770,517,331,544,354,738388,1332,743649,1431
7,"all, >= 100, d_dt",344492,770,331,331,354,354,512610,1332,517202,1431
8,binding,505540,767,1095,405,1892,613,727670,1387,738034,1888
9,"b, >= 100",497221,719,631,310,657,332,712209,1249,717323,1347


In [49]:
############### TESTING: development of the dataset size (pchembl values required) ###############
print("Size of dataset with pchembl values at different points")
pd.DataFrame(all_lengths_pchembl,
                   columns=['type', 
                            '#mols', '#drugs', 
                            '#targets', '#drug_ targets', 
                            '#targets_ mutation', '#drug_ targets_mutation', 
                            '#cpd_tid_ pairs', '#drug_tid_ pairs',
                            '#cpd_ tid_mutation_ pairs', '#drug_ tid_mutation_ pairs'])

Size of dataset with pchembl values at different points


Unnamed: 0,type,#mols,#drugs,#targets,#drug_ targets,#targets_ mutation,#drug_ targets_mutation,#cpd_tid_ pairs,#drug_tid_ pairs,#cpd_ tid_mutation_ pairs,#drug_ tid_mutation_ pairs
0,init,1009870,1735,6806,2581,8184,2983,2235264,23013,2248694,24074
1,pre DTI,1006861,812,6741,414,8104,627,2209353,1454,2222330,1960
2,post DTI,658626,812,1106,414,1990,627,1022315,1454,1033164,1960
3,cpd props,657886,810,1106,414,1990,627,1021147,1452,1031958,1958
4,all assays,657886,810,1106,414,1990,627,1021147,1452,1031958,1958
5,"all, >= 100",650556,770,673,331,701,354,1006980,1332,1012249,1431
6,"all, >= 100, c_dt and d_dt",500948,770,517,331,544,354,738388,1332,743649,1431
7,"all, >= 100, d_dt",344492,770,331,331,354,354,512610,1332,517202,1431
8,binding,505540,767,1095,405,1892,613,727670,1387,738034,1888
9,"b, >= 100",497221,719,631,310,657,332,712209,1249,717323,1347
