# Notebook to Extract Compound-Target Interactions Based on ChEMBL Data

### Authors: Barbara Zdrazil, Lina Heinzke
### 06/2023

This notebook extracts data from ChEMBL in order to retrieve a data set for drug-target and clinical candidate-target associations as well as comparator compounds for the respective targets.

The notebook is based on initial work by Anne Hersey, Patricia Bento, Emma Manners, Paul Leeson, and Andrew Leach:  
*Target-Based Evaluation of “Drug-Like” Properties and Ligand Efficiencies  
Paul D. Leeson, A. Patricia Bento, Anna Gaulton, Anne Hersey, Emma J. Manners, Chris J. Radoux, and Andrew R. Leach  
J. Med. Chem. 2021, 64, 11, 7210–7230  
[DOI: 10.1021/acs.jmedchem.1c00416](https://doi.org/10.1021/acs.jmedchem.1c00416)*


More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes


In [1]:
import numpy as np
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
import sqlite3
from tqdm import tqdm

Pandas settings:

In [2]:
# Display limits used when rendering DataFrames in this notebook.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 100)

Notebook settings:

In [3]:
chembl_version = "32"
# Root folder of the project. It can be overridden without editing the notebook by setting the
# CHEMBL_DATASET_BASE_PATH environment variable; otherwise the original default location is used.
base_path = os.environ.get("CHEMBL_DATASET_BASE_PATH",
                           "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/")
path_results = os.path.join(base_path, "results")
path_sqlite3_database = os.path.join(base_path, "data", 
                                     "chembl_"+chembl_version, 
                                     "chembl_"+chembl_version+"_sqlite", 
                                     "chembl_"+chembl_version+".db")
# sqlite3.connect() creates a new, empty database when the file does not exist, which would only
# surface later as "no such table" errors. Fail early with a clear message instead.
if not os.path.isfile(path_sqlite3_database):
    raise FileNotFoundError("ChEMBL SQLite database not found: " + path_sqlite3_database)
chembl_con = sqlite3.connect(path_sqlite3_database)

# Limit the dataset to literature data (rows whose docs.src_id is 1).
# Note that average pchembl_values, ligand efficiencies,
# first_publication_cpd_target_pair, first_publication_cpd_target_pair_w_pchembl and first_publication_cpd 
# will be based on literature data only if this is set to True.
limit_to_literature = True

# calculate RDKit-based compound properties
calculate_RDKit = True

# Output formats (both can be enabled at the same time):
# write results to csv
write_to_csv = True
# write results to excel
write_to_excel = True

# Which variants of the dataset are written:
# write binding+functional data subsets
write_BF = False
# write binding data subsets
write_B = False
# write full dataset plus filtering columns for binding vs. binding+functional data
write_full_dataset = True
# write dataset into two subsets split based on target class
write_target_class_split = False

# Get Initial Compound-Target Data From ChEMBL

Initial query for activities + related assay, mutation, target and docs information. Compound-target pairs are required to have a pchembl value.

In [4]:
# Activities with a pchembl value, joined with parent compound, assay, mutation, target and
# document information. Compound columns are taken from the parent compound (salts are mapped
# to their parent via molecule_hierarchy).
sql = '''
SELECT act.pchembl_value, 
    md.molregno as parent_molregno, md.chembl_id as parent_chemblid, md.pref_name as parent_pref_name,
    md.max_phase, md.first_approval, md.usan_year, md.black_box_warning, 
    md.prodrug, md.oral, md.parenteral, md.topical, 
    ass.assay_type, ass.tid, 
    vs.mutation,
    td.chembl_id as target_chembl_id, td.pref_name as target_pref_name, td.target_type, td.organism, 
    docs.year, docs.src_id
FROM activities act
INNER JOIN molecule_hierarchy mh 
    ON act.molregno = mh.molregno         -- act.molregno = salt_molregno
INNER JOIN molecule_dictionary md
    ON mh.parent_molregno = md.molregno   -- compound information based on parent compound
INNER JOIN assays ass 
    ON  act.assay_id = ass.assay_id
LEFT JOIN variant_sequences vs
    ON ass.variant_id = vs.variant_id
INNER JOIN target_dictionary td
    ON ass.tid = td.tid
LEFT JOIN docs
    ON act.doc_id = docs.doc_id
WHERE act.pchembl_value is not null
    and act.potential_duplicate = 0
    and act.standard_relation = '='
    and act.data_validity_comment is null
    and td.tid <>22226                    -- exclude unchecked targets
    and td.target_type like '%PROTEIN%'
'''

df_mols = pd.read_sql_query(sql, con=chembl_con)
# limit to literature data (src_id = 1)
# The filter is applied before the derived columns are built so that they are only computed
# for the rows that are actually kept.
if limit_to_literature:
    df_mols = df_mols[df_mols['src_id'] == 1]
# drop() returns a new frame, so the column assignments below do not write into a filtered view
df_mols = df_mols.drop(columns=['src_id'])
# target_id_mutation: "<tid>_<mutation>" if the assay has a mutation annotation, else "<tid>"
df_mols['tid_mutation'] = np.where(df_mols['mutation'].notnull(), 
                                   df_mols['tid'].astype('str')+'_'+df_mols['mutation'], 
                                   df_mols['tid'].astype('str'))
# compound-target association
# Built with vectorised string concatenation instead of formatting each row in Python.
df_mols['cpd_target_pair'] = df_mols['parent_molregno'].astype('str')+'_'+df_mols['tid'].astype('str')
df_mols['cpd_target_pair_mutation'] = df_mols['parent_molregno'].astype('str')+'_'+df_mols['tid_mutation']
df_mols

Unnamed: 0,pchembl_value,parent_molregno,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,assay_type,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,year,tid_mutation,cpd_target_pair,cpd_target_pair_mutation
0,5.40,252199,CHEMBL357278,,,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,252199_10483,252199_10483
1,4.77,253534,CHEMBL357119,,,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,253534_10483,253534_10483
2,6.75,253199,CHEMBL152968,,,,,0,-1,0,0,0,B,10483,,CHEMBL4632,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,2004.0,10483,253199_10483,253199_10483
3,5.22,253199,CHEMBL152968,,,,,0,-1,0,0,0,A,12594,,CHEMBL3356,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,2004.0,12594,253199_12594,253199_12594
4,4.43,253199,CHEMBL152968,,,,,0,-1,0,0,0,A,17045,,CHEMBL340,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,2004.0,17045,253199_17045,253199_17045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2692587,6.03,2723081,CHEMBL5078630,,,,,0,-1,0,0,0,B,13053,,CHEMBL2789,Estradiol 17-beta-dehydrogenase 2,SINGLE PROTEIN,Homo sapiens,2021.0,13053,2723081_13053,2723081_13053
2692588,6.69,2734238,CHEMBL5089787,,,,,0,-1,0,0,0,B,13053,,CHEMBL2789,Estradiol 17-beta-dehydrogenase 2,SINGLE PROTEIN,Homo sapiens,2021.0,13053,2734238_13053,2734238_13053
2692589,7.25,2732004,CHEMBL5087553,,,,,0,-1,0,0,0,B,13053,,CHEMBL2789,Estradiol 17-beta-dehydrogenase 2,SINGLE PROTEIN,Homo sapiens,2021.0,13053,2732004_13053,2732004_13053
2692590,7.15,2734909,CHEMBL5090458,,,,,0,-1,0,0,0,B,13053,,CHEMBL2789,Estradiol 17-beta-dehydrogenase 2,SINGLE PROTEIN,Homo sapiens,2021.0,13053,2734909_13053,2734909_13053


In [5]:
############### TESTING: method to save dataset size at any given point to array ###############
# Both lists are filled by add_dataset_sizes(); every entry has the form
# [label, <the ten counts returned by calculate_dataset_sizes>].
# list with sizes of full dataset
all_lengths = []
# list with sizes of dataset with pchembl values
# these statistics are purely based on removing compound-target pairs without pchembl information
# i.e., the subset of the dataset is determined by the given data parameter and not recalculated (see below)
all_lengths_pchembl = []

def calculate_dataset_sizes(data):
    """Count distinct compounds, targets and compound-target pairs in ``data``.

    For each of the columns parent_molregno, tid, tid_mutation, cpd_target_pair and
    cpd_target_pair_mutation, two numbers are reported: the number of distinct values
    in the whole table and the number of distinct values in the drug subset.

    The drug subset consists of the rows with DTI == "D_DT" if a DTI column exists,
    otherwise of the rows with max_phase == 4.

    Returns a list of ten counts in the order
    [mols, drugs, targets, drug_targets, targets_mutation, drug_targets_mutation,
     pairs, drug_pairs, pairs_mutation, drug_pairs_mutation].
    """
    if 'DTI' in data.columns:
        # drugs = compounds of a compound-target pair with a known interaction
        drug_rows = data[data["DTI"] == "D_DT"]
    else:
        drug_rows = data[data["max_phase"] == 4]

    counted_columns = ["parent_molregno", "tid", "tid_mutation",
                       'cpd_target_pair', 'cpd_target_pair_mutation']
    sizes = []
    for column in counted_columns:
        # full-table count first, directly followed by the drug-subset count
        sizes.append(len(set(data[column])))
        sizes.append(len(set(drug_rows[column])))
    return sizes


def add_dataset_sizes(data, label):
    """Record the current dataset sizes under ``label``.

    Appends [label, <counts>] to all_lengths for the full table and to all_lengths_pchembl
    for the rows that have a value in at least one column whose name starts with
    'pchembl_value'. The counts come from calculate_dataset_sizes().
    """
    snapshot = data.copy()
    all_lengths.append([label, *calculate_dataset_sizes(snapshot)])

    # restrict to data with any pchembl value (any data with a pchembl, even if it is based on only functional data)
    # these statistics are purely based on removing compound-target pairs without pchembl information
    # i.e., the subset of the dataset is determined by the given data parameter and not recalculated
    pchembl_columns = [column for column in snapshot.columns if column.startswith('pchembl_value')]
    with_pchembl = snapshot.dropna(subset=pchembl_columns, how='all')
    all_lengths_pchembl.append([label, *calculate_dataset_sizes(with_pchembl)])

In [6]:
############### TESTING: initial query ###############
# record the dataset sizes directly after the initial ChEMBL query under the label "init"
add_dataset_sizes(df_mols, "init")

# Calculate Mean, Median, and Max *pchembl* Values for Each Compound-Target Pair

The following values are set to summarise the information for compound-target pairs:  

|||
| :----------- | :----------- |
| *pchembl_value_mean* | mean pchembl value for a compound-target pair|
| *pchembl_value_max*| maximum pchembl value for a compound-target pair|
| *pchembl_value_median*| median pchembl value for a compound-target pair|
| *first_publication_cpd_target_pair* | first publication in ChEMBL with this compound-target pair |
| *first_publication_cpd_target_pair_w_pchembl* | first publication in ChEMBL with this compound-target pair and an associated pchembl value |

The values are set for 
- a subset of the dataset based on binding and functional assays (suffix '_BF') and 
- a subset of the dataset based on only binding assays (suffix '_B'). 

Therefore, there are two columns for each of the values above, one with the suffix '_BF' based on binding + functional data and one with the suffix '_B' based on only binding data.

In [7]:
def get_average_info(df, suffix):
    """Summarise pchembl values and publication years per compound-target pair.

    Rows are grouped by (parent_molregno, tid_mutation). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns parent_molregno, tid_mutation, pchembl_value and year.
    suffix : str
        Suffix appended to every summary column name (e.g. 'BF' or 'B').

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the summarised values, with the columns
        parent_molregno, tid_mutation,
        pchembl_value_mean_<suffix>, pchembl_value_max_<suffix>, pchembl_value_median_<suffix>,
        first_publication_cpd_target_pair_<suffix> (minimum year of the pair) and
        first_publication_cpd_target_pair_w_pchembl_<suffix> (minimum year over the rows of the
        pair that have a pchembl value).
    """
    keys = ['parent_molregno', 'tid_mutation']
    # Work on a narrow private copy so that no summary columns are added to the caller's frame.
    work = df[keys + ['pchembl_value', 'year']].copy()
    grouped = work.groupby(keys)

    # pchembl mean, max, median and first publication of the pair, broadcast to every row
    summary = work[keys].assign(**{
        'pchembl_value_mean_'+suffix: grouped['pchembl_value'].transform('mean'),
        'pchembl_value_max_'+suffix: grouped['pchembl_value'].transform('max'),
        'pchembl_value_median_'+suffix: grouped['pchembl_value'].transform('median'),
        'first_publication_cpd_target_pair_'+suffix: grouped['year'].transform('min'),
    })

    # first publication of pair with pchembl value
    first_publication_pchembl = work[work['pchembl_value'].notnull()] \
            .groupby(keys)['year'].min().reset_index() \
            .rename(columns={'year': 'first_publication_cpd_target_pair_w_pchembl_'+suffix})
    summary = summary.merge(first_publication_pchembl, on=keys, how='left')

    # return relevant summarised information without duplicates
    return summary.drop_duplicates()

Summarise the information for binding and functional assays:

In [8]:
suffix = 'BF'
# rows measured in binding (B) or functional (F) assays
is_binding_or_functional = df_mols['assay_type'].isin(['B', 'F'])
df_mols_BF = get_average_info(df_mols[is_binding_or_functional].copy(), suffix)

Summarise the information for only binding assays:

In [9]:
suffix = 'B'
# rows measured in binding (B) assays only
is_binding = df_mols['assay_type'] == 'B'
df_mols_B = get_average_info(df_mols.loc[is_binding].copy(), suffix)

Combine both into one table with two columns per value (one with suffix '_BF' for binding+functional and one with suffix '_B' for binding).

In [10]:
# Compound and target annotations per pair: everything in df_mols except the per-activity columns.
df_pair_annotations = df_mols.drop(columns=['pchembl_value', 'year', 'assay_type']).drop_duplicates()

# df_mols_B is a subset of the compound-target pairs of df_mols_BF, hence the left merge of the
# binding summary onto the binding+functional summary.
# The annotations are left-merged as well because df_mols may contain assays that are of other
# types than binding / functional; pairs seen only in such assays are not added.
df_combined = (
    df_mols_BF
    .merge(df_mols_B, on=['parent_molregno', 'tid_mutation'], how='left')
    .merge(df_pair_annotations, on=['parent_molregno', 'tid_mutation'], how='left')
)

# Extract Drug-Target Interactions With Disease Relevance From the drug_mechanism Table

Extract the known drug-target interactions from ChEMBL (these include some interactions between compounds with a max_phase < 4 and targets). These will be used to determine if compound-target pairs from the activities query above are known compound-target interactions. 

Note: Compound-target pairs can be in the drug_mechanisms table even though the compound is not a drug (max_phase < 4). For ease of writing, these will be referred to as drug-target interactions as well rather than compound-target pairs with a known disease-relevant interaction. 

Only entries with a disease_efficacy of 1 are taken into account, i.e., the target is believed to play a role in the efficacy of the drug.  
*disease_efficacy: Flag to show whether the target assigned is believed to play a role in the efficacy of the drug in the indication(s) for which it is approved (1 = yes, 0 = no).*

In [11]:
# Distinct (parent compound, target) pairs from drug_mechanism with disease_efficacy = 1 and a
# target id. Compounds are mapped to their parent via molecule_hierarchy.
# The join to molecule_dictionary selects no columns; as an INNER JOIN it only keeps pairs whose
# parent_molregno has an entry in molecule_dictionary.
sql = '''
SELECT DISTINCT mh.parent_molregno, dm.tid
FROM drug_mechanism dm
INNER JOIN molecule_hierarchy mh
    ON dm.molregno = mh.molregno
INNER JOIN molecule_dictionary md
    ON mh.parent_molregno = md.molregno
WHERE dm.disease_efficacy = 1
    and dm.tid is not null
'''

df_dti = pd.read_sql_query(sql, con=chembl_con)
df_dti

Unnamed: 0,parent_molregno,tid
0,1124,11060
1,675068,10193
2,1125,10193
3,1085,10193
4,1124,10193
...,...,...
6258,1407411,112
6259,51961,120553
6260,51961,120554
6261,442342,22228


## Map Related Target IDs To Table

Query target_relations for related target ids to increase the number of target ids for which there is data in the drug_mechanisms table.
The following mappings are considered:

||||
|:------|:-----:|-----|
|protein family |-[superset of]->| single protein|
|protein complex |-[superset of]->| single protein|
|protein complex group |-[superset of]->| single protein|
|single protein |-[equivalent to]->| single protein|
|chimeric protein |-[superset of]->| single protein|
|protein-protein interaction |-[superset of]->| single protein|

For example, for *protein family -[superset of]-> single protein* this means:  
If there is a known relevant interaction between a compound and a protein family, interactions between the compound and single proteins of that protein family are considered to be known interactions as well.

In [12]:
# All rows of target_relations, with name, type and organism of the target itself (suffix _1)
# and of the related target (suffix _2).
sql = '''
SELECT tr.tid, tr.relationship, tr.related_tid, 
    td1.pref_name as pref_name_1, td1.target_type as target_type_1, td1.organism as organism_1, 
    td2.pref_name as pref_name_2, td2.target_type as target_type_2, td2.organism as organism_2 
FROM target_relations tr
INNER JOIN target_dictionary td1
    ON tr.tid = td1.tid
INNER JOIN target_dictionary td2
    ON tr.related_tid = td2.tid
'''

df_related_targets = pd.read_sql_query(sql, con=chembl_con)
df_related_targets.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2
0,11699,SUBSET OF,104812,PI4-kinase type II,SINGLE PROTEIN,Homo sapiens,"Phosphatidylinositol 4-kinase, PI4K",PROTEIN FAMILY,Homo sapiens
1,12261,SUBSET OF,104822,c-Jun N-terminal kinase 1,SINGLE PROTEIN,Homo sapiens,"c-Jun N-terminal kinase, JNK",PROTEIN FAMILY,Homo sapiens
2,12261,SUBSET OF,118329,c-Jun N-terminal kinase 1,SINGLE PROTEIN,Homo sapiens,Mitogen-activated protein kinase 8/9,PROTEIN FAMILY,Homo sapiens
3,12755,SUBSET OF,104684,Dopamine D5 receptor,SINGLE PROTEIN,Rattus norvegicus,Dopamine receptor,PROTEIN FAMILY,Rattus norvegicus
4,12735,SUBSET OF,105018,Phosphorylase kinase gamma subunit 2,SINGLE PROTEIN,Homo sapiens,Phosphorylase kinase,PROTEIN COMPLEX GROUP,Homo sapiens


In [13]:
def _select_target_mappings(relations, source_type, relationship="SUPERSET OF"):
    """Return the rows of ``relations`` that map ``source_type`` targets to single proteins.

    A row is kept if target_type_1 equals ``source_type``, target_type_2 is "SINGLE PROTEIN"
    and the relationship column equals ``relationship``.
    """
    return relations[(relations["target_type_1"] == source_type) 
                     & (relations["target_type_2"] == "SINGLE PROTEIN")
                     & (relations["relationship"] == relationship)]


# one table per considered mapping (see the mapping table above)
protein_family_mapping = _select_target_mappings(df_related_targets, "PROTEIN FAMILY")
protein_complex_mapping = _select_target_mappings(df_related_targets, "PROTEIN COMPLEX")
protein_complex_group_mapping = _select_target_mappings(df_related_targets, "PROTEIN COMPLEX GROUP")
# single proteins are mapped via "EQUIVALENT TO" instead of "SUPERSET OF"
single_protein_mapping = _select_target_mappings(df_related_targets, "SINGLE PROTEIN", 
                                                 relationship="EQUIVALENT TO")
chimeric_protein_mapping = _select_target_mappings(df_related_targets, "CHIMERIC PROTEIN")
ppi_mapping = _select_target_mappings(df_related_targets, "PROTEIN-PROTEIN INTERACTION")

relevant_mappings = pd.concat([protein_family_mapping, 
                               protein_complex_mapping, 
                               protein_complex_group_mapping,
                               single_protein_mapping, 
                               chimeric_protein_mapping, 
                               ppi_mapping])
relevant_mappings.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2
269,104829,SUPERSET OF,11111,Cyclooxygenase,PROTEIN FAMILY,Bos taurus,Cyclooxygenase-2,SINGLE PROTEIN,Bos taurus
270,104829,SUPERSET OF,17019,Cyclooxygenase,PROTEIN FAMILY,Bos taurus,Cyclooxygenase-1,SINGLE PROTEIN,Bos taurus
273,104745,SUPERSET OF,10868,Leukotriene B4 receptor,PROTEIN FAMILY,Homo sapiens,Leukotriene B4 receptor 2,SINGLE PROTEIN,Homo sapiens
274,104745,SUPERSET OF,10542,Leukotriene B4 receptor,PROTEIN FAMILY,Homo sapiens,Leukotriene B4 receptor 1,SINGLE PROTEIN,Homo sapiens
288,104699,SUPERSET OF,12854,Adenosine A2 receptor,PROTEIN FAMILY,Rattus norvegicus,Adenosine A2b receptor,SINGLE PROTEIN,Rattus norvegicus


Combine the drug-target-interactions (DTI) and target ids (dti_tids) from the drug mechanism table with the information based on the mapped target ids.

In [14]:
# drug-target-interactions (DTI) and target ids (dti_tids) based on the drug_mechanisms table
# Pair ids have the form "<parent_molregno>_<tid>". They are built with vectorised string
# concatenation, which also yields an empty set (instead of column names) for an empty table.
DTIs_original = set(df_dti['parent_molregno'].astype('str') + '_' + df_dti['tid'].astype('str'))
dti_tids_original = set(df_dti['tid'])

# drug-target-interactions (DTI) and target ids (dti_tids) based on mapped target ids
# inner merge: only drug_mechanism targets that have a relevant mapping contribute rows
df_dti_mapped_targets = df_dti.merge(relevant_mappings, on = 'tid', how = 'inner')
DTIs_mapped = set(df_dti_mapped_targets['parent_molregno'].astype('str') + '_' 
                  + df_dti_mapped_targets['related_tid'].astype('str'))
dti_tids_mapped = set(df_dti_mapped_targets['related_tid'])

# combined drug-target-interactions (DTI) and target ids (dti_tids) 
# based on drug_mechanisms table and mapped target ids
DTIs_set = DTIs_original.union(DTIs_mapped)
dti_tids_set = dti_tids_original.union(dti_tids_mapped)

In [15]:
############### TESTING: before adding additional compounds ###############
# record the dataset sizes of the combined summary table under the label "pre dm table"
add_dataset_sizes(df_combined, "pre dm table")

# Add Compounds From the drug_mechanism Table to the Dataset

Add compound-target pairs from the drug_mechanism table that are not in the dataset based on the initial ChEMBL query.
These are compound-target pairs for which there is no associated pchembl value data.
Since the pairs are known interactions, they are added to the dataset despite not having a pchembl value.

Collect compound-target pairs including mapped targets from the drug_mechanisms table. 

In [16]:
# compound-target pairs from the drug_mechanism query plus the pairs obtained via mapped target ids
cpd_target_pairs = pd.concat([df_dti[['parent_molregno', 'tid']], 
                              df_dti_mapped_targets[['parent_molregno', 'related_tid']]
                              .rename(columns={'related_tid': 'tid'})]).drop_duplicates()

# Set columns existing in the df_combined table.
# None of the targets from the drug mechanism table have any mutation annotation, hence tid_mutation = tid
cpd_target_pairs['tid_mutation'] = cpd_target_pairs['tid'].astype('str')
# Pair ids are built with vectorised string concatenation instead of formatting each row in Python.
cpd_target_pairs['cpd_target_pair'] = (cpd_target_pairs['parent_molregno'].astype('str') + '_' 
                                       + cpd_target_pairs['tid'].astype('str'))
cpd_target_pairs['cpd_target_pair_mutation'] = (cpd_target_pairs['parent_molregno'].astype('str') + '_' 
                                                + cpd_target_pairs['tid_mutation'])

Add a new column *in_dm_table* which is set to True if the compound target pair (taking mutation annotations into account) is in the drug_mechanism table. 

In [17]:
# New column: is the compound target pair (taking mutation annotations into account) in the drug_mechanism table?
# Always True here because cpd_target_pairs was built from the drug_mechanism query and its mapped targets.
cpd_target_pairs['in_dm_table'] = True

Set *in_dm_table* for the initial dataset based on the ChEMBL query (df_combined).

In [18]:
# A pair of the ChEMBL-query dataset is flagged if its mutation-aware pair id also occurs
# among the pairs collected from the drug_mechanism table.
dm_pair_ids = set(cpd_target_pairs['cpd_target_pair_mutation'])
df_combined['in_dm_table'] = df_combined['cpd_target_pair_mutation'].isin(dm_pair_ids)

Limit the pairs to the ones that are not yet in the dataset.  
Mutation annotations are taken into account. 
Therefore, *(cpd A, target B without mutation)* will be added if a pchembl is present for *(cpd A, target B with mutation C)* but not for *(cpd A, target B without mutation)*.

In [19]:
# pairs for which there is no information based on binding or functional assays in the original ChEMBL query
# (compared on the mutation-aware pair id; .copy() so that later merges/assignments work on an independent frame)
cpd_target_pairs = cpd_target_pairs[~(cpd_target_pairs['cpd_target_pair_mutation'].isin(set(df_combined['cpd_target_pair_mutation'])))].copy()
print("#Pairs not yet present based on binding or functional assays:", len(cpd_target_pairs))

#Pairs not yet present based on binding or functional assays: 8849


Query compound and target information and combine it with the new compound-target pairs table.

In [20]:
# Compound-level annotations for every entry of molecule_dictionary, using the same column
# names as in the initial activities query.
sql = '''
SELECT md.molregno as parent_molregno, 
    md.chembl_id as parent_chemblid, md.pref_name as parent_pref_name,
    md.max_phase, md.first_approval, md.usan_year, md.black_box_warning, 
    md.prodrug, md.oral, md.parenteral, md.topical
FROM molecule_dictionary md
'''

df_compound_info = pd.read_sql_query(sql, con=chembl_con)
# left merge: every new compound-target pair is kept, annotated with its compound information
cpd_target_pairs = cpd_target_pairs.merge(df_compound_info, on = 'parent_molregno', how = 'left')

In [21]:
# Target-level annotations for every entry of target_dictionary, using the same column
# names as in the initial activities query.
sql = '''
SELECT td.tid, td.chembl_id as target_chembl_id, td.pref_name as target_pref_name, td.target_type, td.organism
FROM target_dictionary td
'''

df_target_info = pd.read_sql_query(sql, con=chembl_con)
# Fix problems with null not being recognised as None
# (organism values whose string representation is 'null' are replaced by None)
df_target_info.loc[df_target_info['organism'].astype(str) == 'null', 'organism'] = None
# left merge: every new compound-target pair is kept, annotated with its target information
cpd_target_pairs = cpd_target_pairs.merge(df_target_info, on = 'tid', how = 'left')

Combined data of existing query with new compound-target pairs.

In [22]:
# Append the drug_mechanism pairs that are not yet in the dataset.
# ignore_index=True gives the combined table a fresh unique index; without it the row labels
# of cpd_target_pairs would duplicate labels already used in df_combined.
# Columns that only exist in df_combined (e.g. the pchembl summaries) are missing for the new rows.
df_combined = pd.concat([df_combined, cpd_target_pairs], ignore_index=True) 

Add a new column *keep_for_binding* which is set to True if the row should be kept if you want to limit the dataset to only data based on binding assays.   
Rows are kept if 
- there is a binding data-based pchembl value or
- the compound-target pair is in the drug_mechanism table

In [23]:
# keep a row for the binding-only dataset if it has a binding-based pchembl value
# or if the pair is in the drug_mechanism table
has_binding_pchembl = df_combined['pchembl_value_mean_B'].notnull()
is_in_dm_table = df_combined['in_dm_table'] == True
df_combined['keep_for_binding'] = has_binding_pchembl | is_in_dm_table

# DTI (Drug-Target Interaction) Annotations

Every compound-target pair is assigned a DTI (drug target interaction) annotation.  

The assignment is based on three questions:
- Is the compound-target pair in the drug_mechanisms table? = Is it a known relevant compound-target interaction?
- What is the max_phase of the compound? = Is it a drug / clinical compound?
- Is the target in the drug_mechanisms table? = Is it a therapeutic target?

The assignments are based on the following table:

|in drug_mechanisms table?|max_phase?|therapeutic target?|DTI annotation|explanation|
|:-----:|:-----:|:-----:|:-----:|:-----|
|yes|4|-|D_DT|drug - drug target|
|yes|3|-|C3_DT|clinical candidate in phase 3 - drug target|
|yes|2|-|C2_DT|clinical candidate in phase 2 - drug target|
|yes|1|-|C1_DT|clinical candidate in phase 1 - drug target|
|yes|<1|-|C0_DT|compound in unknown clinical phase\[1\] - drug target|
|no|-|yes|DT|drug target|
|no|-|no|NDT|not drug target|


\[1\] There are three possible annotations in ChEMBL with max_phase not between 1 and 4:
- 0.5 = early phase 1 clinical trials  
- -1 = clinical phase unknown for drug or clinical candidate drug, i.e., where ChEMBL cannot assign a clinical phase
- NULL = preclinical compounds with bioactivity data

All three are grouped together into the annotation C0_DT.

Identify which targets are therapeutic targets (= are they in the drug_mechanism table?) and add the field *therapeutic_target* that indicates whether target is a known therapeutic target.  

In [24]:
# True if the target id is in dti_tids_set, i.e. among the target ids of the drug_mechanism
# query or among the target ids mapped from them
df_combined['therapeutic_target'] = df_combined['tid'].isin(dti_tids_set)

Assign the annotations based on the table.

In [25]:
# Is the compound-target pair a known interaction (drug mechanism table incl. mapped targets)?
is_known_pair = df_combined['cpd_target_pair'].isin(DTIs_set)

# Compound-target pairs from the drug mechanism table, annotated by the max_phase of the compound
for phase, annotation in {4: "D_DT", 3: "C3_DT", 2: "C2_DT", 1: "C1_DT"}.items():
    df_combined.loc[is_known_pair & (df_combined['max_phase'] == phase), 'DTI'] = annotation
# Compounds that are in the drug_mechanism table but don't have a known phase between 1-4:
df_combined.loc[is_known_pair & (~df_combined['max_phase'].isin([1, 2, 3, 4])), 'DTI'] = "C0_DT"

# Target from the drug mechanism table
df_combined.loc[(~is_known_pair) & (df_combined['therapeutic_target'] == True), 'DTI'] = "DT"

# Other compound-target pairs
# if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set
# (~is_known_pair) is included for clarity
df_combined.loc[(~is_known_pair) & (df_combined['therapeutic_target'] == False), 'DTI'] = "NDT"

In [26]:
############### TESTING: before discarding NDT rows ###############
# record the dataset sizes after the DTI annotation under the label "pre DTI"
add_dataset_sizes(df_combined, "pre DTI")

Discard rows that were annotated with NDT, i.e., compound-target pairs that are not in the drug_mechanisms table and for which the target was also not in the drug_mechanisms table (not a comparator compound).

In [27]:
# discard NDT rows: only rows with one of the following annotations are kept
kept_annotations = ['D_DT', 'C3_DT', 'C2_DT', 'C1_DT', 'C0_DT', 'DT']
df_combined = df_combined.loc[df_combined['DTI'].isin(kept_annotations)]

In [28]:
############### TESTING: after discarding NDT rows ###############
# record the dataset sizes after removing the NDT rows under the label "post DTI"
add_dataset_sizes(df_combined, "post DTI")

# Add Compound Properties Based on ChEMBL Data

## Add First Appearance of Compound in the Literature

Query and calculate the first appearance of a compound in the literature based on ChEMBL data.

In [29]:
# first appearance of a compound in the literature 
# information about salts is aggregated in the parent
sql = '''
SELECT DISTINCT docs.year, docs.src_id, mh.parent_molregno
FROM docs
LEFT JOIN compound_records cr
    ON docs.doc_id = cr.doc_id
INNER JOIN molecule_hierarchy mh 
    ON cr.molregno = mh.molregno   -- cr.molregno = salt_molregno
WHERE docs.year is not null
'''

df_docs = pd.read_sql_query(sql, con=chembl_con)
# optionally restrict to literature documents (src_id = 1)
if limit_to_literature:
    df_docs = df_docs.loc[df_docs['src_id'].eq(1)]
# earliest document year per parent compound, one row per compound
df_docs = (
    df_docs
    .drop(columns=['src_id'])
    .assign(first_publication_cpd=lambda docs: docs.groupby('parent_molregno')['year'].transform('min'))
    [['parent_molregno', 'first_publication_cpd']]
    .drop_duplicates()
)
df_docs

Unnamed: 0,parent_molregno,first_publication_cpd
0,4941,1974
1,921,1974
2,1005421,1976
3,1750777,1976
4,1750778,1976
...,...,...
2030050,2729296,2022
2030051,2730285,2022
2030052,2722420,2022
2030053,2719912,2022


Combine with previous data.

In [30]:
# left merge: every compound-target pair is kept; first_publication_cpd is missing for
# compounds without an entry in df_docs
df_combined = df_combined.merge(df_docs, on = 'parent_molregno', how = 'left')
df_combined

Unnamed: 0,parent_molregno,tid_mutation,pchembl_value_mean_BF,pchembl_value_max_BF,pchembl_value_median_BF,first_publication_cpd_target_pair_BF,first_publication_cpd_target_pair_w_pchembl_BF,pchembl_value_mean_B,pchembl_value_max_B,pchembl_value_median_B,first_publication_cpd_target_pair_B,first_publication_cpd_target_pair_w_pchembl_B,parent_chemblid,parent_pref_name,max_phase,first_approval,usan_year,black_box_warning,prodrug,oral,parenteral,topical,tid,mutation,target_chembl_id,target_pref_name,target_type,organism,cpd_target_pair,cpd_target_pair_mutation,in_dm_table,keep_for_binding,therapeutic_target,DTI,first_publication_cpd
0,100708,50,7.53,7.53,7.53,1982.0,1982.0,7.53,7.53,7.53,1982.0,1982.0,CHEMBL305153,,,,,0,-1,0,0,0,50,,CHEMBL213,Beta-1 adrenergic receptor,SINGLE PROTEIN,Homo sapiens,100708_50,100708_50,False,True,True,DT,1982.0
1,1798744,136,6.96,6.96,6.96,1986.0,1986.0,6.96,6.96,6.96,1986.0,1986.0,CHEMBL3350133,,,,,0,-1,0,0,0,136,,CHEMBL236,Delta opioid receptor,SINGLE PROTEIN,Homo sapiens,1798744_136,1798744_136,False,True,True,DT,1986.0
2,88622,248,6.00,6.00,6.00,1987.0,1987.0,6.00,6.00,6.00,1987.0,1987.0,CHEMBL57825,,,,,0,-1,0,0,0,248,,CHEMBL1835,Thromboxane-A synthase,SINGLE PROTEIN,Homo sapiens,88622_248,88622_248,False,True,True,DT,1987.0
3,97517,72,5.01,5.01,5.01,1998.0,1998.0,5.01,5.01,5.01,1998.0,1998.0,CHEMBL303519,,,,,0,-1,0,0,0,72,,CHEMBL217,Dopamine D2 receptor,SINGLE PROTEIN,Homo sapiens,97517_72,97517_72,False,True,True,DT,1998.0
4,97517,130,5.25,5.25,5.25,1998.0,1998.0,5.25,5.25,5.25,1998.0,1998.0,CHEMBL303519,,,,,0,-1,0,0,0,130,,CHEMBL234,Dopamine D3 receptor,SINGLE PROTEIN,Homo sapiens,97517_130,97517_130,False,True,True,DT,1998.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627951,581767,11400,,,,,,,,,,,CHEMBL571209,INDOXIMOD,2.0,,2013.0,0,0,0,0,0,11400,,CHEMBL2842,Serine/threonine-protein kinase mTOR,SINGLE PROTEIN,Homo sapiens,581767_11400,581767_11400,True,True,True,C2_DT,2010.0
627952,1927154,201,,,,,,,,,,,CHEMBL3545305,CIBINETIDE,2.0,,2015.0,0,0,0,0,0,201,,CHEMBL1817,Erythropoietin receptor,SINGLE PROTEIN,Homo sapiens,1927154_201,1927154_201,True,True,True,C2_DT,
627953,16973,103450,,,,,,,,,,,CHEMBL277535,BIFONAZOLE,4.0,,1981.0,0,0,0,0,0,103450,,CHEMBL1163121,3-hydroxy-3-methylglutaryl-coenzyme A reductase,SINGLE PROTEIN,Schizosaccharomyces pombe 972h-,16973_103450,16973_103450,True,True,True,D_DT,1995.0
627954,1407411,148,,,,,,,,,,,CHEMBL2135460,TERLIPRESSIN,4.0,2022.0,2006.0,1,0,0,1,0,148,,CHEMBL1921,Vasopressin V1b receptor,SINGLE PROTEIN,Homo sapiens,1407411_148,1407411_148,True,True,True,D_DT,2010.0


## Add ChEMBL Compound Properties and Compound Structures

Add compound properties and structures based on the compound_properties table and the compound_structures table. 

In [31]:
# Compound properties and structures per parent compound.
# compound_properties is joined to molecule_hierarchy on parent_molregno, so only compounds
# that occur as a parent are returned.
# NOTE(review): DISTINCT presumably collapses the repeated rows that arise when several
# molecule_hierarchy entries share the same parent - confirm against the ChEMBL schema.
sql = '''
SELECT DISTINCT mh.parent_molregno, 
    cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
    cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
    cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
    cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
    struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
FROM compound_properties cp
INNER JOIN molecule_hierarchy mh
    ON cp.molregno = mh.parent_molregno
INNER JOIN compound_structures struct
    ON mh.parent_molregno = struct.molregno
'''

df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,2657369,411.94,3.82,4.0,2.0,64.26,5.0,N,0.0,11.94,6.11,3.98,3.96,NEUTRAL,411.94,3.0,29.0,0.67,411.1826,C22H26ClN5O,6.0,2.0,0.0,InChI=1S/C22H26ClN5O/c1-27(2)18-6-3-15(4-7-18)14-24-21(29)16-9-11-28(12-10-16)22-25-19-8-5-17(23...,AAAADVYFXUUVEO-UHFFFAOYSA-N,CN(C)c1ccc(CNC(=O)C2CCN(c3nc4ccc(Cl)cc4[nH]3)CC2)cc1
1,477782,506.37,3.04,8.0,2.0,116.43,8.0,N,1.0,,6.5,2.16,2.11,NEUTRAL,506.37,2.0,27.0,0.53,506.0485,C17H23IN4O4S,8.0,3.0,1.0,"InChI=1S/C17H23IN4O4S/c1-10(2)11-7-14(25-3)12(18)8-13(11)26-15-9-21-17(22-16(15)19)20-5-6-27(4,2...",AAAAEENPAALFRN-UHFFFAOYSA-N,COc1cc(C(C)C)c(Oc2cnc(NCCS(C)(=O)=O)nc2N)cc1I
2,2237474,927.28,7.03,11.0,7.0,252.91,41.0,N,4.0,4.13,,8.43,5.36,ACID,927.28,0.0,65.0,0.02,926.6555,C49H90N4O12,16.0,8.0,4.0,InChI=1S/C49H90N4O12/c1-5-8-10-12-14-16-18-20-22-24-26-28-30-37(31-29-27-25-23-21-19-17-15-13-11...,AAAAJHGLNDAXFP-VNKVACROSA-N,CCCCCCCCCCCCCCC(CCCCCCCCCCCCCC)C(=O)OC[C@H]1OC(O)[C@H](NC(C)=O)[C@@H](OCC(=O)N[C@@H](CC)C(=O)N[C...
3,412019,271.32,1.72,2.0,2.0,65.2,1.0,N,0.0,13.43,,0.77,0.77,NEUTRAL,271.32,2.0,20.0,0.83,271.1321,C15H17N3O2,5.0,2.0,0.0,"InChI=1S/C15H17N3O2/c1-8-7-16-14(19)13-12(8)10-6-9(15(20)18(2)3)4-5-11(10)17-13/h4-6,8,17H,7H2,1...",AAAAKTROWFNLEP-UHFFFAOYSA-N,CC1CNC(=O)c2[nH]c3ccc(C(=O)N(C)C)cc3c21
4,26284,323.35,2.13,4.0,1.0,71.53,3.0,N,0.0,,4.73,1.13,1.13,NEUTRAL,323.35,2.0,24.0,0.94,323.127,C18H17N3O3,6.0,1.0,0.0,InChI=1S/C18H17N3O3/c1-11(22)20-10-17-16-8-14-7-12(13-3-2-6-19-9-13)4-5-15(14)21(16)18(23)24-17/...,AAAATQFUBIBQIS-IRXDYDNUSA-N,CC(=O)NC[C@@H]1OC(=O)N2c3ccc(-c4cccnc4)cc3C[C@@H]12


Combine with previous data.

In [32]:
# left join: compound-target pairs without a property / structure entry are kept with null values
df_combined = pd.merge(df_combined, df_cpd_props, how='left', on='parent_molregno')

In [33]:
############### TESTING: compound props ###############
# add_dataset_sizes is defined outside this section; it presumably records the sizes of the
# current dataset under the given label ("cpd props") -- verify against its definition
add_dataset_sizes(df_combined, "cpd props")

## Calculate Ligand Efficiency (LE) Metrics

Calculate the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas:

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $\Delta\text{G} = -RT \ln(K_d)$, $-RT \ln(K_i)$, or $-RT \ln(IC_{50})$

$\text{LE}=\frac{2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl\_mean}}{\text{heavy\_atoms}}$


$\text{BEI}=\frac{\text{pchembl\_mean} \cdot 1000}{\text{mw\_freebase}}$

$\text{SEI}=\frac{\text{pchembl\_mean} \cdot 100}{\text{PSA}}$

$\text{LLE}=\text{pchembl\_mean}-\text{ALOGP}$

Since LE metrics are based on pchembl values, they are calculated twice.
Once for the pchembl values based on binding + functional assays (BF) and once for the pchembl values based on binding assays only (B).

In [34]:
# Constant factor of the LE formula above: 2.303 * 298 * 0.00199
le_factor = 2.303 * 298 * 0.00199

# The metrics are calculated twice: based on binding + functional assays (BF)
# and based on binding assays only (B).
for suffix in ['BF', 'B']:
    pchembl_mean = df_combined['pchembl_value_mean_' + suffix]

    # metrics that divide by a compound property; a property value of 0 yields +/-infinity
    ratio_metrics = {
        'LE_' + suffix: pchembl_mean / df_combined['heavy_atoms'] * le_factor,
        'BEI_' + suffix: pchembl_mean * 1000 / df_combined['mw_freebase'],
        'SEI_' + suffix: pchembl_mean * 100 / df_combined['psa'],
    }
    for metric_name, metric_values in ratio_metrics.items():
        # Infinite values are not useful and are turned into missing values.
        # np.nan is used as the replacement instead of None: the meaning of
        # replace(x, None) differs between pandas versions (older versions forward-fill),
        # whereas np.nan always yields a plain float64 column with NaN.
        df_combined[metric_name] = (metric_values
                                    .astype('float64')
                                    .replace([np.inf, -np.inf], np.nan))

    # LLE is a difference, no division involved
    df_combined['LLE_' + suffix] = (pchembl_mean - df_combined['alogp']).astype('float64')

## Add ATC Classifications (Level 1)

Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables.

In [35]:
# Query the ATC level 1 code and description of every compound with an ATC classification.
# molecule_atc_classification is joined via the level5 code, and molecule_hierarchy maps the
# annotated molregno to its parent_molregno.
sql = '''
SELECT DISTINCT mh.parent_molregno, atc.level1, level1_description
FROM atc_classification atc
INNER JOIN molecule_atc_classification matc
    ON atc.level5 = matc.level5
INNER JOIN molecule_hierarchy mh
    ON matc.molregno = mh.molregno
'''

atc_levels = pd.read_sql_query(sql, con=chembl_con)
# combined label "<level1>_<level1_description>", e.g. "N_NERVOUS SYSTEM"
atc_levels["l1_full"] = atc_levels["level1"] + "_" + atc_levels["level1_description"]
atc_levels

Unnamed: 0,parent_molregno,level1,level1_description,l1_full
0,628156,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
1,2286380,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,2089491,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,608601,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4,1927225,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...,...,...
4003,675276,A,ALIMENTARY TRACT AND METABOLISM,A_ALIMENTARY TRACT AND METABOLISM
4004,2197623,N,NERVOUS SYSTEM,N_NERVOUS SYSTEM
4005,1383224,C,CARDIOVASCULAR SYSTEM,C_CARDIOVASCULAR SYSTEM
4006,675183,A,ALIMENTARY TRACT AND METABOLISM,A_ALIMENTARY TRACT AND METABOLISM


Combine ATC level annotations for the same parent_molregno into one description.

In [36]:
between_str_join = ' | '
# every row of a compound receives the alphabetically sorted, joined list of all its level 1 labels
l1_labels_per_compound = atc_levels.groupby('parent_molregno')['l1_full']
atc_levels['atc_level1'] = l1_labels_per_compound.transform(
    lambda labels: between_str_join.join(sorted(labels)))
# rows of the same compound are now identical in the two kept columns and collapse into one row
atc_levels = atc_levels.loc[:, ['parent_molregno', 'atc_level1']].drop_duplicates()
atc_levels

Unnamed: 0,parent_molregno,atc_level1
0,628156,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
1,2286380,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,2089491,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
3,608601,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4,1927225,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
...,...,...
4003,675276,A_ALIMENTARY TRACT AND METABOLISM
4004,2197623,N_NERVOUS SYSTEM
4005,1383224,C_CARDIOVASCULAR SYSTEM
4006,675183,A_ALIMENTARY TRACT AND METABOLISM


Combine with previous data.

In [37]:
# left join: compounds without an ATC classification keep a null atc_level1
df_combined = pd.merge(df_combined, atc_levels, how='left', on='parent_molregno')

# Add Target Class Annotations Based on ChEMBL Data

Add information about level 1 and level 2 target class annotations in ChEMBL.

In [38]:
# Query the protein classifications assigned to the components of each target (tid).
# A target with several components (or components with several classes) yields several rows.
sql = '''
SELECT DISTINCT tc.tid, 
    pc.protein_class_id, pc.pref_name, pc.short_name, pc.protein_class_desc, pc.definition
FROM protein_classification pc
-- join several tables to get the corresponding target id
INNER JOIN component_class cc
    ON pc.protein_class_id = cc.protein_class_id
INNER JOIN component_sequences cs
    ON cc.component_id = cs.component_id
INNER JOIN target_components tc
    ON cs.component_id = tc.component_id
'''

df_target_classes = pd.read_sql_query(sql, con=chembl_con)
# only interested in the target ids that are in the current dataset
current_tids = set(df_combined['tid'])
df_target_classes = df_target_classes[df_target_classes['tid'].isin(current_tids)]
df_target_classes

Unnamed: 0,tid,protein_class_id,pref_name,short_name,protein_class_desc,definition
0,1,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond
1,2,1133,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...
2,3,104,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,
3,4,1583,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...
4,5,422,Nicotinic acetylcholine receptor alpha subunit,CHRN alpha,ion channel lgic ach chrn alpha,
...,...,...,...,...,...,...
11530,120396,601,Unclassified protein,Unclassified,unclassified,
11532,120399,89,Threonine protease T1A subfamily,T1A,enzyme protease threonine pbt t1a,
11648,120506,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond
11683,120553,601,Unclassified protein,Unclassified,unclassified,


Query the protein_classification table for the protein classification hierarchy and merge it with the target class information for specific tids.

In [39]:
# Recursive query over protein_classification:
# the anchor selects the rows without a parent (parent_id IS NULL), the recursive part appends
# the pref_name of each child class to the path of its parent, separated by '|'.
# The resulting 'names' column holds the full path from the top of the hierarchy to each class.
sql = '''
WITH RECURSIVE pc_hierarchy AS (
    SELECT protein_class_id,
            parent_id,
            class_level,
            pref_name AS names
    FROM protein_classification
    WHERE parent_id IS NULL

    UNION ALL
   
    SELECT pc.protein_class_id,
        pc.parent_id,
        pc.class_level,
        -- recursively add current protein classification pref_name to string, separated by |
        pc_hierarchy.names || '|' || pc.pref_name 
    FROM protein_classification pc, pc_hierarchy
    WHERE pc.parent_id = pc_hierarchy.protein_class_id
)
SELECT *
FROM pc_hierarchy
'''


target_class_hierarchy = pd.read_sql_query(sql, con=chembl_con)
# split the path into one column per hierarchy level
# NOTE(review): the assignment expects the split to produce exactly seven columns (l0 - l6)
# and no '|' inside a pref_name -- confirm when switching to another ChEMBL version
target_class_hierarchy[['l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'l6']] = target_class_hierarchy['names'].str.split('|', expand=True)
# drop the entry with protein_class_id 0 (presumably the root of the hierarchy -- verify)
# and keep only the level 1 and level 2 names
target_class_hierarchy = target_class_hierarchy[target_class_hierarchy['protein_class_id'] != 0][['protein_class_id', 'l1', 'l2']]
# annotate every (tid, protein class) row with the level 1 and level 2 class names
df_target_classes = df_target_classes.merge(target_class_hierarchy, on = 'protein_class_id', how = 'left')
df_target_classes

Unnamed: 0,tid,protein_class_id,pref_name,short_name,protein_class_desc,definition,l1,l2
0,1,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond,Enzyme,Hydrolase
1,2,1133,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...,Transporter,Primary active transporter
2,3,104,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,,Enzyme,Phosphodiesterase
3,4,1583,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...,Ion channel,Voltage-gated ion channel
4,5,422,Nicotinic acetylcholine receptor alpha subunit,CHRN alpha,ion channel lgic ach chrn alpha,,Ion channel,Ligand-gated ion channel
...,...,...,...,...,...,...,...,...
1828,120396,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
1829,120399,89,Threonine protease T1A subfamily,T1A,enzyme protease threonine pbt t1a,,Enzyme,Protease
1830,120506,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond,Enzyme,Hydrolase
1831,120553,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,


Summarise the information for a target id with several assigned target classes of level 1 into one description. If a target id has more than one assigned target class, the target class 'Unclassified protein' is discarded.

In [40]:
level = 'l1'
between_str_join = '|'
# unique (tid, level 1 class) pairs; targets without a level 1 class are left out
target_classes_level1 = df_target_classes[['tid', level]].drop_duplicates().dropna()

# remove 'Unclassified protein' from targets with more than one target class, level 1
# (targets whose only class is 'Unclassified protein' keep it)
nof_classes = target_classes_level1.groupby(['tid'])[level].count()
has_several_classes = target_classes_level1['tid'].map(nof_classes) > 1
is_unclassified = target_classes_level1[level] == 'Unclassified protein'
# .copy() so that the column assignment below works on an independent frame
# instead of a slice of the unfiltered one (avoids a SettingWithCopyWarning)
target_classes_level1 = target_classes_level1[~(has_several_classes & is_unclassified)].copy()

# combine the remaining classes of a target into one alphabetically sorted description
target_classes_level1['target_class_l1'] = target_classes_level1.groupby(['tid'])[level].transform(
    lambda x: between_str_join.join(sorted(x)))
target_classes_level1 = target_classes_level1[['tid', 'target_class_l1']].drop_duplicates()

df_combined = df_combined.merge(target_classes_level1, on='tid', how='left')

Repeat the summary step for target classes of level 2.

In [41]:
level = 'l2'
# unique (tid, level 2 class) pairs; targets without a level 2 class are left out
target_classes_level2 = df_target_classes.loc[:, ['tid', level]].drop_duplicates().dropna()

# between_str_join is the separator set in the level 1 summary step
l2_classes_per_target = target_classes_level2.groupby('tid')[level]
target_classes_level2['target_class_l2'] = l2_classes_per_target.transform(
    lambda class_names: between_str_join.join(sorted(class_names)))
target_classes_level2 = target_classes_level2.loc[:, ['tid', 'target_class_l2']].drop_duplicates()

df_combined = pd.merge(df_combined, target_classes_level2, how='left', on='tid')

Instances with targets with more than one target class assigned to them.  
These could be reassigned by hand if a single target class is preferable.

In [42]:
############### TESTING: which targets have more than one level 1 target class assigned to them? ###############
# a combined description contains the separator '|' only if several classes were joined
l1_description = df_combined['target_class_l1']
has_several_l1_classes = (l1_description.notnull()) & (l1_description.str.contains('|', regex=False))
target_columns = ['tid', 'target_pref_name', 'target_type', 'target_class_l1', 'target_class_l2']
test = df_combined.loc[has_several_l1_classes, target_columns].drop_duplicates()
print("#Instances with >1 level 1 target class:", len(test))
test

#Instances with >1 level 1 target class: 47


Unnamed: 0,tid,target_pref_name,target_type,target_class_l1,target_class_l2
283,104295,Cyclin-dependent kinase 4/cyclin D1,PROTEIN COMPLEX,Enzyme|Other cytosolic protein,Kinase
1578,105036,Atrial natriuretic peptide receptor,PROTEIN FAMILY,Enzyme|Membrane receptor,Lyase
4803,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
7329,104770,Sodium/potassium-transporting ATPase,PROTEIN COMPLEX GROUP,Enzyme|Ion channel|Transporter,Hydrolase|Other ion channel|Primary active transporter
13811,104852,"Sulfonylurea receptor 1, Kir6.2",PROTEIN COMPLEX,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
13948,104758,Potassium-transporting ATPase,PROTEIN COMPLEX,Enzyme|Transporter,Hydrolase|Primary active transporter
14046,104782,"Sulfonylurea receptor 2, Kir6.2",PROTEIN COMPLEX,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
24571,104717,Gamma-secretase,PROTEIN COMPLEX,Enzyme|Ion channel,Other ion channel|Protease
25059,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion channel|Membrane receptor,Family A G protein-coupled receptor|Ligand-gated ion channel
63485,190,Leukocyte adhesion glycoprotein LFA-1 alpha,SINGLE PROTEIN,Adhesion|Membrane receptor,


In [43]:
############### TESTING: which targets have more than one level 2 target class assigned to them? ###############
# a combined description contains the separator '|' only if several classes were joined
l2_description = df_combined['target_class_l2']
has_several_l2_classes = (l2_description.notnull()) & (l2_description.str.contains('|', regex=False))
target_columns = ['tid', 'target_pref_name', 'target_type', 'target_class_l1', 'target_class_l2']
test = df_combined.loc[has_several_l2_classes, target_columns].drop_duplicates()
print("#Instances with >1 level 2 target class:", len(test))
test

#Instances with >1 level 2 target class: 24


Unnamed: 0,tid,target_pref_name,target_type,target_class_l1,target_class_l2
4803,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
7329,104770,Sodium/potassium-transporting ATPase,PROTEIN COMPLEX GROUP,Enzyme|Ion channel|Transporter,Hydrolase|Other ion channel|Primary active transporter
13811,104852,"Sulfonylurea receptor 1, Kir6.2",PROTEIN COMPLEX,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
13948,104758,Potassium-transporting ATPase,PROTEIN COMPLEX,Enzyme|Transporter,Hydrolase|Primary active transporter
14046,104782,"Sulfonylurea receptor 2, Kir6.2",PROTEIN COMPLEX,Ion channel|Transporter,Primary active transporter|Voltage-gated ion channel
24571,104717,Gamma-secretase,PROTEIN COMPLEX,Enzyme|Ion channel,Other ion channel|Protease
25059,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion channel|Membrane receptor,Family A G protein-coupled receptor|Ligand-gated ion channel
128192,11564,Sarcoplasmic/endoplasmic reticulum calcium ATPase 1,SINGLE PROTEIN,Enzyme|Transporter,Hydrolase|Primary active transporter
240881,101310,CREB-binding protein,SINGLE PROTEIN,Epigenetic regulator,Reader|Writer
400647,322,DNA (cytosine-5)-methyltransferase 3A,SINGLE PROTEIN,Epigenetic regulator,Reader|Writer


# Add RDKit-Based Compound Descriptors

## Built-in Compound Descriptors

Add relevant compound descriptors using built-in RDKit methods. 

In [44]:
if calculate_RDKit:
    # PandasTools has difficulties working with null values,
    # so rows without a SMILES are set aside and appended again at the end
    has_smiles = df_combined['canonical_smiles'].notnull()
    df_combined_w_smiles = df_combined[has_smiles].copy()
    df_combined_wo_smiles = df_combined[~has_smiles].copy()

    # temporary column with RDKit molecules, the input of all descriptor functions
    PandasTools.AddMoleculeColumnToFrame(df_combined_w_smiles, 'canonical_smiles', 'mol', includeFingerprints=False)

    # new column name -> RDKit function calculating the descriptor from a molecule
    rdkit_descriptors = {
        'fraction_csp3': Descriptors.FractionCSP3,
        'num_aliphatic_carbocycles': Descriptors.NumAliphaticCarbocycles,
        'num_aliphatic_heterocycles': Descriptors.NumAliphaticHeterocycles,
        'num_aliphatic_rings': Descriptors.NumAliphaticRings,
        'num_aromatic_carbocycles': Descriptors.NumAromaticCarbocycles,
        'num_aromatic_heterocycles': Descriptors.NumAromaticHeterocycles,
        'num_aromatic_rings': Descriptors.NumAromaticRings,
        'num_heteroatoms': Descriptors.NumHeteroatoms,
        'num_saturated_carbocycles': Descriptors.NumSaturatedCarbocycles,
        'num_saturated_heterocycles': Descriptors.NumSaturatedHeterocycles,
        'num_saturated_rings': Descriptors.NumSaturatedRings,
        'ring_count': Descriptors.RingCount,
        'num_stereocentres': Chem.rdMolDescriptors.CalcNumAtomStereoCenters,
    }
    for descriptor_name, descriptor_function in rdkit_descriptors.items():
        df_combined_w_smiles[descriptor_name] = df_combined_w_smiles['mol'].apply(descriptor_function)

    # scaffolds including stereo information
    PandasTools.AddMurckoToFrame(df_combined_w_smiles, 'mol', 'scaffold_w_stereo')
    # RemoveStereochemistry changes the molecule objects in place;
    # the scaffolds calculated afterwards therefore carry no stereo information
    for mol in df_combined_w_smiles['mol']:
        Chem.RemoveStereochemistry(mol)
    PandasTools.AddMurckoToFrame(df_combined_w_smiles, 'mol', 'scaffold_wo_stereo')

    # the RDKit molecules are not needed anymore
    df_combined_w_smiles = df_combined_w_smiles.drop(columns=['mol'])

    # combine both sections of the table again (rows without SMILES end up at the bottom)
    df_combined = pd.concat([df_combined_w_smiles,
                             df_combined_wo_smiles]).reset_index(drop=True)

## Aromaticity Descriptors

Add descriptors for aromaticity, using an RDKit-based method.

In [45]:
def calculate_aromatic_atoms(smiles_set):
    """Count the aromatic atoms of every SMILES in smiles_set.

    Parameters
    ----------
    smiles_set : iterable of str
        SMILES strings to analyse (null values must be excluded by the caller).

    Returns
    -------
    tuple of four dicts, each mapping SMILES -> count
        aromatic_atoms_dict:  all aromatic atoms
        aromatic_c_dict:      aromatic carbon atoms
        aromatic_n_dict:      aromatic nitrogen atoms
        aromatic_hetero_dict: aromatic atoms that are neither carbon nor hydrogen

    SMILES that RDKit cannot parse are reported and left out of all four dicts,
    i.e., mapping them onto a column yields a null value instead of raising an error.
    """
    aromatic_atoms_dict = dict()
    aromatic_c_dict = dict()
    aromatic_n_dict = dict()
    aromatic_hetero_dict = dict()

    for smiles in tqdm(smiles_set):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles returns None if the SMILES cannot be parsed
            print("Could not parse SMILES, no aromaticity descriptors calculated:", smiles)
            continue

        # single pass over the atoms instead of one index-based loop per descriptor
        nof_aromatic = nof_aromatic_c = nof_aromatic_n = nof_aromatic_hetero = 0
        for atom in mol.GetAtoms():
            if not atom.GetIsAromatic():
                continue
            nof_aromatic += 1
            atomic_num = atom.GetAtomicNum()
            if atomic_num == 6:
                nof_aromatic_c += 1
            elif atomic_num == 7:
                nof_aromatic_n += 1
            if atomic_num != 6 and atomic_num != 1:
                nof_aromatic_hetero += 1

        aromatic_atoms_dict[smiles] = nof_aromatic
        aromatic_c_dict[smiles] = nof_aromatic_c
        aromatic_n_dict[smiles] = nof_aromatic_n
        aromatic_hetero_dict[smiles] = nof_aromatic_hetero

    return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict

In [46]:
if calculate_RDKit:
    # df_combined_w_smiles only contains rows with a SMILES, i.e., null values are excluded
    smiles_set = set(df_combined_w_smiles["canonical_smiles"])
    aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = calculate_aromatic_atoms(smiles_set)

    # new column name -> lookup of the respective count by SMILES
    aromaticity_lookups = {
        'aromatic_atoms': aromatic_atoms_dict,
        'aromatic_c': aromatic_c_dict,
        'aromatic_n': aromatic_n_dict,
        'aromatic_hetero': aromatic_hetero_dict,
    }
    for descriptor_name, count_by_smiles in aromaticity_lookups.items():
        df_combined[descriptor_name] = df_combined['canonical_smiles'].map(count_by_smiles)

100%|██████████| 402410/402410 [01:55<00:00, 3491.25it/s]


# Remove Compounds Without a SMILES and Mixtures

## Remove Compounds With a SMILES Containing a '.'

Double-check that rows with a SMILES containing a '.' are the parent structures, i.e., there was no error in using salt information instead of parent information.  
These compounds are salts or mixtures and will be removed in the next step.

In [47]:
sql = '''
SELECT DISTINCT mh.molregno as salt_molregno, mh.parent_molregno
FROM molecule_hierarchy mh
'''
df_hierarchy = pd.read_sql_query(sql, con=chembl_con)

# unique (SMILES, parent_molregno) pairs of all rows with a '.' in the SMILES
contains_dot = df_combined['canonical_smiles'].notnull() & df_combined['canonical_smiles'].str.contains('.', regex=False)
smiles_with_dot = df_combined.loc[contains_dot, ['canonical_smiles', 'parent_molregno']].drop_duplicates()

issue_ctr = 0
for parent_molregno in set(smiles_with_dot['parent_molregno']):
    # check 1: the molregno should occur at least once as a parent_molregno
    rows_as_parent = df_hierarchy[df_hierarchy['parent_molregno'] == parent_molregno]
    if len(rows_as_parent) == 0:
        display(rows_as_parent)
        issue_ctr += 1
    # check 2: wherever it occurs as a salt_molregno, the parent_molregno should be identical,
    # i.e., the molregno is its own parent
    rows_as_salt = df_hierarchy[df_hierarchy['salt_molregno'] == parent_molregno]
    is_own_parent = rows_as_salt['salt_molregno'].equals(rows_as_salt['parent_molregno'])
    if not is_own_parent:
        display(rows_as_salt)
        issue_ctr += 1

print("#Problems:", issue_ctr)

#Problems: 0


Double-check that each parent_molregno does indeed correspond to a SMILES containing a dot.

In [48]:
sql = '''
SELECT DISTINCT mh.parent_molregno, struct.canonical_smiles
FROM molecule_hierarchy mh
INNER JOIN compound_structures struct
    ON mh.parent_molregno = struct.molregno
'''
df_parent_smiles = pd.read_sql_query(sql, con=chembl_con)

issue_ctr = 0
for parent_molregno in set(smiles_with_dot['parent_molregno']):
    # .item() raises an error unless exactly one SMILES is found for the parent_molregno
    is_parent_in_chembl = df_parent_smiles['parent_molregno'] == parent_molregno
    smiles_in_chembl = df_parent_smiles.loc[is_parent_in_chembl, 'canonical_smiles'].item()
    is_parent_in_dataset = smiles_with_dot['parent_molregno'] == parent_molregno
    smiles_in_dataset = smiles_with_dot.loc[is_parent_in_dataset, 'canonical_smiles'].item()

    if smiles_in_chembl == smiles_in_dataset:
        continue
    # report the mismatch, followed by an empty line
    print(parent_molregno, smiles_in_chembl, smiles_in_dataset, "", sep="\n")
    issue_ctr += 1

print("#Problems:", issue_ctr)

#Problems: 0


Remove rows that contain a SMILES with a dot or that don't have a SMILES.

In [49]:
# the two masks are disjoint: a row without a SMILES cannot have a SMILES with a dot
lacks_smiles = df_combined['canonical_smiles'].isnull()
is_dot_compound = df_combined['parent_molregno'].isin(set(smiles_with_dot['parent_molregno']))

len_missing_smiles = int(lacks_smiles.sum())
len_smiles_w_dot = int(is_dot_compound.sum())
size_report = "{:30} {}"
print(size_report.format("#Rows w/o SMILES:", len_missing_smiles))
print(size_report.format("#Rows w SMILES with dot:", len_smiles_w_dot))
print(size_report.format("Predicted size after removal:", len(df_combined)-len_missing_smiles-len_smiles_w_dot))
# keep only rows that have a SMILES and do not belong to a compound with a dot in the SMILES
df_combined = df_combined[~lacks_smiles & ~is_dot_compound]
print(size_report.format("Size:", len(df_combined)))

#Rows w/o SMILES:              2694
#Rows w SMILES with dot:       273
Predicted size after removal:  624989
Size:                          624989


# Get Relevant Subsets of the Data

## Preparation

Change nan values and empty strings to None for consistency. 

In [50]:
# Change all None / nan values to None
df_combined = df_combined.where(pd.notnull(df_combined), None)
# Replace empty strings with None.
# The dict form {'': None} is used on purpose: replace('', None) is interpreted differently
# depending on the pandas version (older versions ignore the explicit None and forward-fill
# the previous value instead), whereas the dict form always replaces with None.
df_combined = df_combined.replace({'': None}).reset_index(drop=True)

Set the relevant columns to Ints instead of floats.  
Round columns with floats to 4 decimal places.

In [51]:
# Years and counts are whole numbers but contain null values,
# hence the nullable integer type Int64 instead of float
nullable_int_columns = [
    'first_approval', 'usan_year',
    'first_publication_cpd_target_pair_BF', 'first_publication_cpd_target_pair_w_pchembl_BF',
    'first_publication_cpd_target_pair_B', 'first_publication_cpd_target_pair_w_pchembl_B',
    'first_publication_cpd',
    'hba', 'hbd', 'rtb', 'num_ro5_violations', 'aromatic_rings', 'heavy_atoms',
    'hba_lipinski', 'hbd_lipinski', 'num_lipinski_ro5_violations',
]
df_combined = df_combined.astype(dict.fromkeys(nullable_int_columns, 'Int64'))

if calculate_RDKit:
    # the RDKit-based counts only exist if the descriptors were calculated
    nullable_int_columns_rdkit = [
        'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings',
        'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings',
        'num_heteroatoms',
        'num_saturated_carbocycles', 'num_saturated_heterocycles', 'num_saturated_rings',
        'ring_count', 'num_stereocentres',
        'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero',
    ]
    df_combined = df_combined.astype(dict.fromkeys(nullable_int_columns_rdkit, 'Int64'))

Round to 4 decimal places

In [52]:
# Round all float columns except max_phase to a fixed number of decimal places
decimal_places = 4
for col, dtype in df_combined.dtypes.to_dict().items():
    if col == 'max_phase':
        # max_phase is deliberately left unrounded
        continue
    is_float_column = (dtype == 'float64') or (dtype == 'Float64')
    if is_float_column:
        df_combined[col] = df_combined[col].round(decimals=decimal_places)

Reorder columns.

In [53]:
print("#Columns before: ", len(df_combined.columns))

# The column order is assembled from three parts so that the long list of shared columns
# exists only once (it was previously duplicated for the two calculate_RDKit cases).

# columns that are always present: compound, target, pair, property and annotation columns
columns_general = ['parent_molregno', 'parent_chemblid', 'parent_pref_name',
                   'max_phase', 'first_approval', 'usan_year', 'black_box_warning',
                   'prodrug', 'oral', 'parenteral', 'topical',
                   'tid', 'mutation', 'target_chembl_id', 'target_pref_name', 'target_type',
                   'organism', 'tid_mutation',
                   'cpd_target_pair', 'cpd_target_pair_mutation',
                   'pchembl_value_mean_BF', 'pchembl_value_max_BF', 'pchembl_value_median_BF',
                   'first_publication_cpd_target_pair_BF', 'first_publication_cpd_target_pair_w_pchembl_BF',
                   'pchembl_value_mean_B', 'pchembl_value_max_B', 'pchembl_value_median_B',
                   'first_publication_cpd_target_pair_B', 'first_publication_cpd_target_pair_w_pchembl_B',
                   'therapeutic_target', 'DTI',
                   'first_publication_cpd', 'mw_freebase', 'alogp', 'hba', 'hbd', 'psa',
                   'rtb', 'ro3_pass', 'num_ro5_violations', 'cx_most_apka', 'cx_most_bpka',
                   'cx_logp', 'cx_logd', 'molecular_species', 'full_mwt', 'aromatic_rings',
                   'heavy_atoms', 'qed_weighted', 'mw_monoisotopic', 'full_molformula',
                   'hba_lipinski', 'hbd_lipinski', 'num_lipinski_ro5_violations',
                   'standard_inchi', 'standard_inchi_key', 'canonical_smiles',
                   'LE_B', 'BEI_B', 'SEI_B', 'LLE_B',
                   'LE_BF', 'BEI_BF', 'SEI_BF', 'LLE_BF',
                   'atc_level1', 'target_class_l1', 'target_class_l2']

# columns that only exist if the RDKit-based descriptors were calculated
columns_rdkit = ['fraction_csp3',
                 'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings',
                 'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings',
                 'num_heteroatoms',
                 'num_saturated_carbocycles', 'num_saturated_heterocycles', 'num_saturated_rings',
                 'ring_count', 'num_stereocentres',
                 'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero',
                 'scaffold_w_stereo', 'scaffold_wo_stereo']

# flag columns, always placed last
columns_flags = ['in_dm_table', 'keep_for_binding']

if calculate_RDKit:
    column_order = columns_general + columns_rdkit + columns_flags
else:
    column_order = columns_general + columns_flags

# selecting by list reorders the columns; a column missing from the table raises a KeyError,
# a column missing from the list is dropped (visible in the printed column counts)
df_combined = df_combined[column_order]

print("#Columns afterwards: ", len(df_combined.columns))

#Columns before:  90
#Columns afterwards:  90


## Sanity Checks

Check if there are mixed types in columns with dtype=object.

In [54]:
# A column of dtype object is considered clean if converting its values to strings
# changes nothing, apart from None being turned into the string 'None'.
issue_ctr = 0
for col, dtype in df_combined.dtypes.to_dict().items():
    if dtype != object:
        continue
    values = set(df_combined[col])
    values_as_str = set(df_combined[col].astype(str))
    lost_by_conversion = values - values_as_str
    created_by_conversion = values_as_str - values
    # any difference other than None <-> 'None' means the column holds non-string values
    if lost_by_conversion.difference({None}) or created_by_conversion.difference({'None'}):
        print("Mixed types in column ", col)
        print(lost_by_conversion, '/', created_by_conversion)
        issue_ctr += 1

print("#Problems:", issue_ctr)

#Problems: 0


Check the different column types and the respective number of null values in each column.

In [55]:
# Overview table: position, name, dtype and number of null values of every column.
row_template = "{:3} {:50} {:10} {}"
print(row_template.format("", "column", "type", "#null values"))
for i, (col, dtype) in enumerate(df_combined.dtypes.to_dict().items()):
    nof_null_values = len(df_combined[df_combined[col].isnull()])
    print(row_template.format(i, col, str(dtype), nof_null_values))

    column                                             type       #null values
  0 parent_molregno                                    int64      0
  1 parent_chemblid                                    object     0
  2 parent_pref_name                                   object     597283
  3 max_phase                                          float64    604852
  4 first_approval                                     Int64      616241
  5 usan_year                                          Int64      612631
  7 prodrug                                            int64      0
  8 oral                                               int64      0
  9 parenteral                                         int64      0
 10 topical                                            int64      0
 11 tid                                                int64      0
 12 mutation                                           object     608714
 13 target_chembl_id                                   object     0
 14 target_p

Check if any columns contain nan or null which aren't recognised as null values. 

In [56]:
# Do any columns contain the strings 'nan' or 'null' in rows that pandas considers non-null?
issue_ctr = 0
# string representation of a missing value -> message printed if it is found
null_strings = [('nan', "Issue with nan in column"), 
                ('null', "Issue with null in column")]
for col in df_combined.dtypes.to_dict():
    # all values that are not recognised as null, cast to strings
    non_null_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str))
    for null_string, message in null_strings:
        if null_string in non_null_as_str:
            print(message, col)
            issue_ctr += 1

print("#Problems:", issue_ctr)

#Problems: 0


Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) are in the drug_mechanism table.  
Note that this is not true for the pchembl_x_B columns based on binding data. They may be in the table because there is data based on functional assays but no data based on binding assays. 

In [57]:
# Every row without a pchembl_value_x_BF value must have in_dm_table set to True.
issue_ctr = 0
in_dm_table = df_combined['in_dm_table'] == True
for pchembl in ['pchembl_value_mean_BF', 'pchembl_value_max_BF', 'pchembl_value_median_BF']:
    no_pchembl = df_combined[pchembl].isnull()
    rows_wo_pchembl = df_combined[no_pchembl]
    rows_wo_pchembl_in_dm = df_combined[in_dm_table & no_pchembl]
    # both selections are identical if no row outside of the dm table lacks a pchembl value
    if not rows_wo_pchembl.equals(rows_wo_pchembl_in_dm):
        print("Problem with", pchembl)
        issue_ctr += 1
        
print("#Problems:", issue_ctr)

#Problems: 0


Check that ligand efficiency metrics are only null when at least one of the values used to calculate them is null.

In [58]:
# A ligand efficiency metric must be null exactly for the rows 
# in which one of the values needed to calculate it is missing (or zero where it is compared to 0).
issue_ctr = 0
for suffix in ['BF', 'B']:
    no_pchembl = df_combined['pchembl_value_mean_'+suffix].isnull()
    # (metric column prefix, rows for which the metric is expected to be null, message prefix)
    expected_nulls = [
        ('LE_', 
         no_pchembl | (df_combined['heavy_atoms'].isnull()) | (df_combined['heavy_atoms'] == 0), 
         "Problem with LE_"),
        ('BEI_', 
         no_pchembl | (df_combined['mw_freebase'].isnull()) | (df_combined['mw_freebase'] == 0), 
         "Problem with BEI_"),
        ('SEI_', 
         no_pchembl | (df_combined['psa'].isnull()) | (df_combined['psa'] == 0), 
         "Problem with SEI_"),
        ('LLE_', 
         no_pchembl | (df_combined['alogp'].isnull()), 
         "Problem with LLE_"),
    ]
    for metric, expected_null_rows, message in expected_nulls:
        observed_null_rows = df_combined[metric+suffix].isnull()
        if not df_combined[observed_null_rows].equals(df_combined[expected_null_rows]):
            print(message+suffix)
            issue_ctr += 1
        
print("#Problems:", issue_ctr)

#Problems: 0


Check that compound props are only null if the parent_molregno is not in the compound props table or if the value in the compound props table is null.

In [59]:
# Null compound properties must be explained by the compound props table:
# either the parent_molregno is not in the table or the table itself holds a null value.
issue_ctr = 0

# rows without any compound props because the parent_molregno is not in the compound props table
molregnos_w_props = set(df_cpd_props['parent_molregno'])
no_cpd_prop_info = len(df_combined[~df_combined['parent_molregno'].isin(molregnos_w_props)])

for col in df_cpd_props.columns:
    if col == 'parent_molregno':
        continue
    # rows for which the compound props table has an entry but the property is null
    molregnos_w_null_prop = set(df_cpd_props[df_cpd_props[col].isnull()]['parent_molregno'])
    missing_values = len(df_combined[df_combined['parent_molregno'].isin(molregnos_w_null_prop)])
    null_values = no_cpd_prop_info+missing_values
    if null_values != len(df_combined[df_combined[col].isnull()]):
        print("Problem with column", col)
        issue_ctr += 1
        
print("#Problems:", issue_ctr)

#Problems: 0


Check that atc_level1 and target class information is only null if the parent_molregno / target id is not in the respective table.

In [60]:
# atc_level1 / target class columns must be null exactly for the rows 
# whose parent_molregno / tid does not occur in the respective lookup table
issue_ctr = 0

# (column to check, key column, lookup table, message)
lookup_checks = [
    ('atc_level1', 'parent_molregno', atc_levels, "Problem with atc_level1"),
    ('target_class_l1', 'tid', target_classes_level1, "Problem with target_class_l1"),
    ('target_class_l2', 'tid', target_classes_level2, "Problem with target_class_l2"),
]
for checked_col, key_col, lookup_table, message in lookup_checks:
    rows_w_null = df_combined[(df_combined[checked_col].isnull())]
    rows_not_in_lookup = df_combined[~df_combined[key_col].isin(set(lookup_table[key_col]))]
    if not rows_w_null.equals(rows_not_in_lookup):
        print(message)
        issue_ctr += 1

print("#Problems:", issue_ctr)

#Problems: 0


Check that columns set by the RDKit are only null if there is no canonical SMILES for the molecule.  
Scaffolds are excluded from this test because they can be None if the molecule is acyclic. 

In [61]:
if calculate_RDKit:
    # RDKit-based columns should have as many null values as there are rows without canonical SMILES
    issue_ctr = 0
    nof_rows_wo_smiles = len(df_combined[df_combined['canonical_smiles'].isnull()])

    rdkit_columns = ['fraction_csp3', 'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings', 
                     'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings', 
                     'num_heteroatoms', 'num_saturated_carbocycles', 'num_saturated_heterocycles', 
                     'num_saturated_rings', 'ring_count', 'num_stereocentres',
                     'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']
    for col in rdkit_columns:
        nof_null_values = len(df_combined[df_combined[col].isnull()])
        if nof_null_values != nof_rows_wo_smiles:
            print("Problem with ", col)
            issue_ctr += 1

    print("#Problems:", issue_ctr)

#Problems: 0


## Calculate Subsets

Calculate different subsets of the data based on binding and functional data in ChEMBL.

In [62]:
# function to calculate and return the different subsets of interest
def get_data_subsets(data, min_nof_cpds, desc):
    """
    Calculate increasingly restricted subsets of the dataset for one assay description.

    :param data: Dataframe with the columns for both assay descriptions ('B' and 'BF').
    :param min_nof_cpds: Minimum number of rows with a pchembl value that a target (tid_mutation) needs in order to be kept.
    :param desc: Assay description to keep. 'B' drops the 'BF' columns, any other value drops the 'B' columns.
    :return: Tuple of four dataframes: 
        the data without the columns of the other assay description (duplicates dropped), 
        the targets with at least min_nof_cpds rows with a pchembl value, 
        of these the targets with at least one D_DT / C3_DT / C2_DT / C1_DT / C0_DT pair, 
        of these the targets with at least one D_DT pair.
    """
    # columns of the other assay description are removed
    drop_desc = 'BF' if desc == 'B' else 'B'
    desc_specific_prefixes = ['pchembl_value_mean_', 
                              'pchembl_value_max_', 
                              'pchembl_value_median_', 
                              'first_publication_cpd_target_pair_', 
                              'first_publication_cpd_target_pair_w_pchembl_', 
                              'LE_',
                              'BEI_',
                              'SEI_',
                              'LLE_']
    columns_to_drop = [prefix+drop_desc for prefix in desc_specific_prefixes]
    data = data.drop(columns=columns_to_drop).drop_duplicates()
    
    # Targets with at least *min_nof_cpds* rows with a pchembl value.
    has_pchembl = data['pchembl_value_mean_'+desc].notnull()
    comparator_counts = data[has_pchembl].groupby(['tid_mutation'])['parent_molregno'].count()
    targets_w_enough_cpds = comparator_counts[comparator_counts >= min_nof_cpds].index.tolist()
    df_enough_cpds = data.query('tid_mutation in @targets_w_enough_cpds')
    
    # Of these: targets with at least one compound-target pair with a known interaction, 
    # i.e., labelled as 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT'.
    known_interaction_labels = ['D_DT', 'C3_DT', 'C2_DT', 'C1_DT', 'C0_DT']
    has_known_interaction = df_enough_cpds['DTI'].isin(known_interaction_labels)
    c_dt_d_dt_targets = set(df_enough_cpds[has_known_interaction].tid_mutation.to_list())
    df_c_dt_d_dt = df_enough_cpds.query('tid_mutation in @c_dt_d_dt_targets')
    
    # Of these: targets with at least one known drug-target interaction ('D_DT').
    is_drug_target_pair = df_enough_cpds['DTI'] == 'D_DT'
    d_dt_targets = set(df_enough_cpds[is_drug_target_pair].tid_mutation.to_list())
    df_d_dt = df_enough_cpds.query('tid_mutation in @d_dt_targets')
    
    return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt

Function to write a dataset to csv and excel.

In [63]:
def write_output(df, filename):
    """
    Write dataframe df to csv and / or excel, depending on write_to_csv and write_to_excel.

    :param df: Dataframe to write.
    :param filename: Path of the output file without file extension (.csv / .xlsx is appended).
    :return: Returns False if writing to excel was unsuccessful, True otherwise.
    :rtype: bool
    """
    if write_to_csv:
        df.to_csv(filename+".csv", sep = ";", index = False)

    if not write_to_excel:
        return True

    excel_path = filename + ".xlsx"
    try:
        with pd.ExcelWriter(excel_path,engine='xlsxwriter') as writer: 
            writer.book.use_zip64()
            df.to_excel(writer, index = False)
    except ValueError as e: # full dataset may be too large to write to excel
        # an empty file would be confusing, remove it if the writer left one behind
        if os.path.exists(excel_path):
            os.remove(excel_path)
        print(e)
        return False
    return True

## Binding and Functional Assays

In [64]:
# subsets based on binding and functional assays
# assay description = binding+functional
desc = 'BF'
# minimum number of compounds with a pchembl value per target
min_nof_cpds_BF = 100
(df_combined_BF, 
 df_combined_BF_enough_cpds, 
 df_combined_BF_c_dt_d_dt, 
 df_combined_BF_d_dt) = get_data_subsets(df_combined.copy(), min_nof_cpds_BF, desc)

Write to csv and excel.

In [65]:
if write_BF:
    # output file names (without file extension) of the binding+functional subsets
    # note that the first one is almost identical to the full dataset which will be saved later on
    # however, the binding-related columns are dropped
    name_BF = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_BF")
    name_BF_100 = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_BF_"+ str(min_nof_cpds_BF))
    name_BF_100_c_dt_d_dt = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_BF_"+ str(min_nof_cpds_BF) + "_c_dt_d_dt")
    name_BF_100_d_dt = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_BF_"+ str(min_nof_cpds_BF) + "_d_dt")

    # write all subsets, the success flag is only True if every subset could be written
    write_BF_success = write_output(df_combined_BF, name_BF)
    write_BF_success &= write_output(df_combined_BF_enough_cpds, name_BF_100)
    write_BF_success &= write_output(df_combined_BF_c_dt_d_dt, name_BF_100_c_dt_d_dt)
    write_BF_success &= write_output(df_combined_BF_d_dt, name_BF_100_d_dt)

In [66]:
############### TESTING: binding and functional assays ###############
# record the sizes of the binding+functional subsets for the overview tables
for subset, label in [(df_combined_BF, "all assays"), 
                      (df_combined_BF_enough_cpds, "all, >= 100"), 
                      (df_combined_BF_c_dt_d_dt, "all, >= 100, c_dt and d_dt"), 
                      (df_combined_BF_d_dt, "all, >= 100, d_dt")]:
    add_dataset_sizes(subset, label)

## Only Binding Assays

In [67]:
# subsets based on binding assays only
# assay description = binding
desc = 'B'
# minimum number of compounds with a pchembl value per target
min_nof_cpds_B = 100
# only rows flagged with keep_for_binding are considered
(df_combined_B, 
 df_combined_B_enough_cpds, 
 df_combined_B_c_dt_d_dt, 
 df_combined_B_d_dt) = get_data_subsets(df_combined[df_combined['keep_for_binding'] == True].copy(), 
                                        min_nof_cpds_B, desc)

Write to csv and excel.

In [68]:
if write_B:
    # Write the binding-only subsets; write_B_success is only True if every subset could be written.
    name_B = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_B")
    write_B_success = write_output(df_combined_B, name_B)

    name_B_100 = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_B_"+ str(min_nof_cpds_B))
    write_B_success &= write_output(df_combined_B_enough_cpds, name_B_100)

    # path_results must only be passed once: prepending it to the file name as well 
    # makes os.path.join discard the first component (absolute path) and the file 
    # ends up next to the results directory as "<path_results>ChEMBL..." instead of inside it
    name_B_100_c_dt_d_dt = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_B_"+ str(min_nof_cpds_B) + "_c_dt_d_dt")
    write_B_success &= write_output(df_combined_B_c_dt_d_dt, name_B_100_c_dt_d_dt)

    name_B_100_d_dt = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_B_"+ str(min_nof_cpds_B) + "_d_dt")
    write_B_success &= write_output(df_combined_B_d_dt, name_B_100_d_dt)

In [69]:
############### TESTING: binding assays ###############
# record the sizes of the binding subsets for the overview tables
for subset, label in [(df_combined_B, "binding"), 
                      (df_combined_B_enough_cpds, "b, >= 100"), 
                      (df_combined_B_c_dt_d_dt, "b, >= 100, c_dt and d_dt"), 
                      (df_combined_B_d_dt, "b, >= 100, d_dt")]:
    add_dataset_sizes(subset, label)

## Full Dataset

Set filtering values, e.g.,  
if *BF_100* = True, these rows should be kept to get the BF_100 dataset (based on binding and functional data with at least 100 comparator compounds per target).

In [70]:
# Add one boolean filter column per subset to the full dataset: 
# a row is True in column *name* if its index is part of the respective subset.

# reset the counter so that it only counts problems found in this cell 
# and not problems left over from a previously executed sanity check cell
issue_ctr = 0

for df, name in zip([df_combined_BF_enough_cpds, 
                     df_combined_BF_c_dt_d_dt, 
                     df_combined_BF_d_dt, 
                     df_combined_B_enough_cpds, 
                     df_combined_B_c_dt_d_dt, 
                     df_combined_B_d_dt
                    ], 
                    ['BF_'+str(min_nof_cpds_BF), 
                     'BF_'+str(min_nof_cpds_BF)+'_c_dt_d_dt', 
                     'BF_'+str(min_nof_cpds_BF)+'_d_dt', 
                     'B_'+str(min_nof_cpds_B), 
                     'B_'+str(min_nof_cpds_B)+'_c_dt_d_dt', 
                     'B_'+str(min_nof_cpds_B)+'_d_dt']):
    df_combined[name] = False
    df_combined.loc[(df_combined.index.isin(df.index)), name] = True
    # check that filtering works: 
    # selecting the flagged rows and the columns of the subset must reproduce the subset
    if not df_combined[df_combined[name]==True][df.columns].equals(df):
        print("Problem with", name)
        issue_ctr += 1
        
print("Number of problems:", issue_ctr)

Number of problems: 0


Write full dataset to csv and excel.

In [71]:
# Output path (without file extension) of the full dataset.
# It is set even if nothing is written.
name_all = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_all")
if write_full_dataset: 
    # False if writing to excel failed (see write_output)
    write_full_success = write_output(df_combined, name_all)

## Split Into Two Subsets Based on Target Classes

Split full dataset into two excel files (grouped by target class: kinases, proteases, membrane receptors vs. other).

Analyse the distribution of target classes to confirm that the split splits the dataset into roughly equal subsets.

In [72]:
if write_target_class_split:
    # number of rows per combination of level 1 and level 2 target class, largest groups first
    count_col = 'cpd_target_pair_mutation'
    target_class_counts = df_combined.groupby(['target_class_l1', 
                                            'target_class_l2'], dropna=False)[count_col].count().reset_index()
    classes_by_count = target_class_counts.sort_values( by=[count_col], ascending=False).reset_index(drop=True)
    nof_rows = classes_by_count[count_col].sum()
    # every row of the full dataset must be part of exactly one group
    if nof_rows != len(df_combined):
        print('Error in calculating the numbers per group!')

    # groups that belong to the target classes of interest
    class_l1 = classes_by_count['target_class_l1']
    class_l2 = classes_by_count['target_class_l2']
    membrane_receptors = classes_by_count[class_l1.notnull() & class_l1.str.contains('Membrane receptor')]
    kinases = classes_by_count[class_l2.notnull() & class_l2.str.contains('Kinase')]
    proteases = classes_by_count[class_l2.notnull() & class_l2.str.contains('Protease')]
    nof_membrane_receptors = membrane_receptors[count_col].sum()
    nof_kinases = kinases[count_col].sum()
    nof_proteases = proteases[count_col].sum()

    print("#Rows:", nof_rows)
    print("#Membrane receptors:", nof_membrane_receptors)
    print("#Kinases:", nof_kinases)
    print("#Proteases:", nof_proteases)
    print()
    print("Half of #Rows:", nof_rows/2)
    print("#Membrane receptors + kinases + proteases:", nof_membrane_receptors+nof_kinases+nof_proteases)

Write the distribution of the different target classes into an excel file.

In [73]:
if write_target_class_split:
    # Label every target class group with the dataset it belongs to:
    # 0 = membrane receptors, kinases and proteases, 1 = all other groups.
    # NOTE(review): membrane_receptors, kinases and proteases are expected to be 
    # slices of classes_by_count here; confirm they have not been rebound since.
    indices = list(membrane_receptors.index) + list(kinases.index) + list(proteases.index)
    classes_by_count['dataset'] = 1
    classes_by_count.loc[classes_by_count.index.isin(indices), 'dataset'] = 0
    
    # write the labelled distribution to the results folder
    classes_by_count_name = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_classes_by_count")
    print(classes_by_count_name)
    write_output(classes_by_count, classes_by_count_name)

Split dataset into two subsets based on the target classes chosen above.

In [74]:
if write_target_class_split:
    # rows of the full dataset that belong to the target classes of interest
    class_l1 = df_combined['target_class_l1']
    class_l2 = df_combined['target_class_l2']
    membrane_receptors = df_combined[class_l1.notnull() & class_l1.str.contains('Membrane receptor')]
    kinases = df_combined[class_l2.notnull() & class_l2.str.contains('Kinase')]
    proteases = df_combined[class_l2.notnull() & class_l2.str.contains('Protease')]

    # dataset 0: membrane receptors, kinases and proteases
    dataset_0 = pd.concat([membrane_receptors, kinases, proteases]).drop_duplicates()
    # dataset 1: all rows that are not part of dataset 0
    dataset_1 = df_combined.loc[~df_combined.index.isin(dataset_0.index)]

    # the sizes of both datasets should add up to the size of the full dataset
    size_0 = len(dataset_0)
    size_1 = len(dataset_1)
    print("Size membrane receptors + kinases + proteases (0):", size_0)
    print("Size other targets (1):", size_1)
    print("Size 0 + 1:", size_0 + size_1)
    print("Size df_combined:", len(df_combined))

Write the subsets into two excel files.

In [75]:
if write_target_class_split:
    # output file names (without file extension) of the two target class based subsets
    name_membrane_kinase_protease = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_membrane_receptor_kinase_protease_targets")
    name_other = os.path.join(path_results, "ChEMBL"+chembl_version+"_CTI_other_targets")

    # the success flag is only True if both subsets could be written
    write_target_class_success = write_output(dataset_0, name_membrane_kinase_protease)
    write_target_class_success &= write_output(dataset_1, name_other)

## Writing / Reading Sanity Checks

Check that output files can be written and read and are identical to the original dataframes.

In [76]:
def test_equality(current_file, read_file_name, assay_type, file_type_list):
    """
    Read a written output file back in and print whether it equals the dataframe in memory.

    :param current_file: Dataframe that was written to the output file.
    :param read_file_name: Path of the output file without file extension.
    :param assay_type: 'BF', 'B' or 'all'; determines which assay-specific columns are cast to Int64.
    :param file_type_list: File types to check, 'csv' and / or 'xlsx'.
    """
    expected = current_file.copy().reset_index(drop=True)

    # columns that are read as strings from csv
    csv_str_columns = ['mutation', 'tid_mutation', 'atc_level1', 'target_class_l2', 
                       'ro3_pass', 'molecular_species', 'full_molformula', 
                       'standard_inchi', 'standard_inchi_key', 'canonical_smiles', 
                       'scaffold_w_stereo', 'scaffold_wo_stereo']
        
    for file_type in file_type_list:
        if file_type == 'csv':
            try:
                read_file = pd.read_csv(read_file_name+".csv", sep = ";", 
                                        dtype=dict.fromkeys(csv_str_columns, 'str'))
            except FileNotFoundError:
                print(read_file_name+".csv not found")
                continue
        elif file_type == 'xlsx':
            try:
                read_file = pd.read_excel(read_file_name+".xlsx")
            except FileNotFoundError:
                print(read_file_name+".xlsx not found")
                continue
        
        # collect the columns that are cast to the nullable integer type Int64 after reading
        int64_columns = []
        if assay_type in ('BF', 'all'):
            int64_columns += ['first_publication_cpd_target_pair_BF', 
                              'first_publication_cpd_target_pair_w_pchembl_BF']
        if assay_type in ('B', 'all'):
            int64_columns += ['first_publication_cpd_target_pair_B', 
                              'first_publication_cpd_target_pair_w_pchembl_B']
        int64_columns += ['first_approval', 'usan_year', 'first_publication_cpd', 
                          'hba', 'hbd', 'rtb', 'num_ro5_violations', 
                          'aromatic_rings', 'heavy_atoms', 
                          'hba_lipinski', 'hbd_lipinski', 'num_lipinski_ro5_violations']
        if calculate_RDKit:
            int64_columns += ['num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings', 
                              'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings', 
                              'num_heteroatoms', 
                              'num_saturated_carbocycles', 'num_saturated_heterocycles', 'num_saturated_rings', 
                              'ring_count', 'num_stereocentres', 
                              'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']
        read_file = read_file.astype(dict.fromkeys(int64_columns, 'Int64'))
        
        print(read_file_name)
        print("{:5} file is ok: {}".format(file_type, read_file.equals(expected)))
    print("----------")

In [77]:
# file types that were requested as output
file_type_list = [file_type 
                  for file_type, enabled in (('csv', write_to_csv), ('xlsx', write_to_excel)) 
                  if enabled]

# Some output was written
if len(file_type_list) > 0:
    # If writing a group of files was not fully successful, 
    # the last file type in the list is not checked for that group.
    
    # binding + functional
    if write_BF:
        print("Check BF subset")
        file_types_BF = file_type_list if write_BF_success else file_type_list[:-1]
        for subset, subset_name in [(df_combined_BF, name_BF), 
                                    (df_combined_BF_enough_cpds, name_BF_100), 
                                    (df_combined_BF_c_dt_d_dt, name_BF_100_c_dt_d_dt), 
                                    (df_combined_BF_d_dt, name_BF_100_d_dt)]:
            test_equality(subset, subset_name, 'BF', file_types_BF)

    # binding only
    if write_B:
        print("Check B subset")
        file_types_B = file_type_list if write_B_success else file_type_list[:-1]
        for subset, subset_name in [(df_combined_B, name_B), 
                                    (df_combined_B_enough_cpds, name_B_100), 
                                    (df_combined_B_c_dt_d_dt, name_B_100_c_dt_d_dt), 
                                    (df_combined_B_d_dt, name_B_100_d_dt)]:
            test_equality(subset, subset_name, 'B', file_types_B)

    # full dataset
    if write_full_dataset:
        print("Check full dataset")
        file_types_full = file_type_list if write_full_success else file_type_list[:-1]
        test_equality(df_combined, name_all, 'all', file_types_full)

    # split by target class
    if write_target_class_split:
        print("Check target class split")
        file_types_split = file_type_list if write_target_class_success else file_type_list[:-1]
        for subset, subset_name in [(dataset_0, name_membrane_kinase_protease), 
                                    (dataset_1, name_other)]:
            test_equality(subset, subset_name, 'all', file_types_split)

Check full dataset
/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/ChEMBL32_CTI_all
csv   file is ok: True
/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/ChEMBL32_CTI_all
xlsx  file is ok: True
----------


Test whether both subsets combined are equal to the full dataset.

In [78]:
def test_dataset_split(current_file, read_file_names, assay_type, file_type_list):
    """
    Read several written output files back in and print whether their concatenation 
    equals the dataframe in memory (both sorted by cpd_target_pair_mutation).

    :param current_file: Dataframe that the combined files should be equal to.
    :param read_file_names: Paths of the output files without file extension.
    :param assay_type: 'BF', 'B' or 'all'; determines which assay-specific columns are cast to Int64.
    :param file_type_list: File types to check, 'csv' and / or 'xlsx'.
    """
    curr_file_copy = current_file.copy().sort_values(by=['cpd_target_pair_mutation']).reset_index(drop=True)

    # columns that are read as strings from csv
    csv_str_columns = ['mutation', 'tid_mutation', 'atc_level1', 'target_class_l2', 
                       'ro3_pass', 'molecular_species', 'full_molformula', 
                       'standard_inchi', 'standard_inchi_key', 'canonical_smiles', 
                       'scaffold_w_stereo', 'scaffold_wo_stereo']
    
    for file_type in file_type_list:
        files = []
        for read_file_name in read_file_names:
            if file_type == 'csv':
                try:
                    read_file = pd.read_csv(read_file_name+".csv", sep = ";", 
                                            dtype=dict.fromkeys(csv_str_columns, 'str'))
                except FileNotFoundError:
                    print(read_file_name+".csv not found")
                    continue
            elif file_type == 'xlsx':
                try:
                    read_file = pd.read_excel(read_file_name+".xlsx")
                except FileNotFoundError:
                    print(read_file_name+".xlsx not found")
                    continue

            # collect the columns that are cast to the nullable integer type Int64 after reading
            int64_columns = []
            if assay_type == 'BF' or assay_type == 'all':
                int64_columns += ['first_publication_cpd_target_pair_BF', 
                                  'first_publication_cpd_target_pair_w_pchembl_BF']
            if assay_type == 'B' or assay_type == 'all':
                int64_columns += ['first_publication_cpd_target_pair_B', 
                                  'first_publication_cpd_target_pair_w_pchembl_B']
            int64_columns += ['first_approval', 'usan_year', 'first_publication_cpd', 
                              'hba', 'hbd', 'rtb', 'num_ro5_violations', 
                              'aromatic_rings', 'heavy_atoms', 
                              'hba_lipinski', 'hbd_lipinski', 'num_lipinski_ro5_violations']
            if calculate_RDKit:
                int64_columns += ['num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings', 
                                  'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings', 
                                  'num_heteroatoms', 
                                  'num_saturated_carbocycles', 'num_saturated_heterocycles', 'num_saturated_rings', 
                                  'ring_count', 'num_stereocentres', 
                                  'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']
            read_file = read_file.astype(dict.fromkeys(int64_columns, 'Int64'))

            files.append(read_file)
        
        if len(files) == 0:
            # none of the files could be read (each missing file was reported above);
            # pd.concat raises a ValueError for an empty list, so skip this file type
            print("{:5} files could not be read, skipping comparison".format(file_type))
            continue

        readfiles = pd.concat(files)
        readfiles = readfiles.sort_values(by=['cpd_target_pair_mutation']).reset_index(drop=True)
        
        for read_file_name in read_file_names:
            print(read_file_name)
        print("{:5} file is ok: {}".format(file_type, readfiles.equals(curr_file_copy)))
    print("----------")

In [79]:
# file types that were requested as output
file_type_list = [file_type 
                  for file_type, enabled in (('csv', write_to_csv), ('xlsx', write_to_excel)) 
                  if enabled]

if write_target_class_split and len(file_type_list) > 0:
    # the last file type is not checked if writing the split was not fully successful
    file_types_split = file_type_list if write_target_class_success else file_type_list[:-1]
    test_dataset_split(df_combined, [name_membrane_kinase_protease, name_other], 'all', file_types_split)

In [80]:
# The filter columns are named after the minimum number of compounds per target, 
# build the names from the settings instead of assuming a threshold of 100.
filter_col_BF = 'BF_'+str(min_nof_cpds_BF)
filter_col_B = 'B_'+str(min_nof_cpds_B)
# Binding+Functional contains Binding and additional rows, 
# i.e., rows in the BF subset may or may not be in the B subset
print(set(df_combined[df_combined[filter_col_BF] == True][filter_col_B]))
# The Binding Dataset is a subset of Binding+Functional,
# i.e., if the thresholds are identical, only True is expected here
print(set(df_combined[df_combined[filter_col_B] == True][filter_col_BF]))

{False, True}
{True}


# Testing: Overview of Dataset Sizes at Different Points in the Pipeline

In [81]:
############### TESTING: development of the full dataset size ###############
print("Size of full dataset at different points.")
# one row per point in the pipeline at which the dataset sizes were recorded
size_columns = ['type', 
                '#mols', '#drugs', 
                '#targets', '#drug_ targets', 
                '#targets_ mutation', '#drug_ targets_mutation', 
                '#cpd_tid_ pairs', '#drug_tid_ pairs',
                '#cpd_ tid_mutation_ pairs', '#drug_ tid_mutation_ pairs']
pd.DataFrame(all_lengths, columns=size_columns)

Size of full dataset at different points.


Unnamed: 0,type,#mols,#drugs,#targets,#drug_ targets,#targets_ mutation,#drug_ targets_mutation,#cpd_tid_ pairs,#drug_tid_ pairs,#cpd_ tid_mutation_ pairs,#drug_ tid_mutation_ pairs
0,init,580803,1389,6360,2323,7692,2706,983678,11888,996239,12948
1,pre dm table,578097,1303,6288,2275,7606,2657,961724,10859,973836,11918
2,pre DTI,580924,2158,6911,1031,8231,1243,970558,5786,982685,6300
3,post DTI,404128,2158,1744,1031,2633,1243,617561,5786,627956,6300
4,cpd props,404128,2158,1744,1031,2633,1243,617561,5786,627956,6300
5,all assays,402282,1740,1398,845,2287,1057,614594,5109,624989,5623
6,"all, >= 100",393191,1328,644,383,669,405,595438,2639,600160,2743
7,"all, >= 100, c_dt and d_dt",384450,1328,605,383,629,405,583398,2639,588120,2743
8,"all, >= 100, d_dt",278358,1328,383,383,405,405,431589,2639,436053,2743
9,binding,367324,1740,1398,845,2200,1052,565054,5109,575027,5618


In [82]:
############### TESTING: development of the dataset size (pchembl values required) ###############
print("Size of dataset with any pchembl values at different points.")
print("This includes data for which we only have pchembl data for functional assays but not for binding assays.")
# one row per point in the pipeline at which the dataset sizes were recorded
size_columns_pchembl = ['type', 
                        '#mols', '#drugs', 
                        '#targets', '#drug_ targets', 
                        '#targets_ mutation', '#drug_ targets_mutation', 
                        '#cpd_tid_ pairs', '#drug_tid_ pairs',
                        '#cpd_ tid_mutation_ pairs', '#drug_ tid_mutation_ pairs']
pd.DataFrame(all_lengths_pchembl, columns=size_columns_pchembl)

Size of dataset with any pchembl values at different points.
This includes data for which we only have pchembl data for functional assays but not for binding assays.


Unnamed: 0,type,#mols,#drugs,#targets,#drug_ targets,#targets_ mutation,#drug_ targets_mutation,#cpd_tid_ pairs,#drug_tid_ pairs,#cpd_ tid_mutation_ pairs,#drug_ tid_mutation_ pairs
0,init,580803,1389,6360,2323,7692,2706,983678,11888,996239,12948
1,pre dm table,578097,1303,6288,2275,7606,2657,961724,10859,973836,11918
2,pre DTI,578097,773,6288,412,7606,623,961724,1357,973836,1869
3,post DTI,401075,773,1121,412,2008,623,608727,1357,619107,1869
4,cpd props,401075,773,1121,412,2008,623,608727,1357,619107,1869
5,all assays,400843,769,1121,412,2008,623,608326,1352,618706,1864
6,"all, >= 100",392145,719,644,315,669,337,592740,1215,597458,1318
7,"all, >= 100, c_dt and d_dt",383404,719,605,315,629,337,580700,1215,585418,1318
8,"all, >= 100, d_dt",277384,719,383,315,405,337,429133,1215,433593,1318
9,binding,365814,729,1113,403,1913,609,558669,1291,568627,1798


Print detailed stats.

In [83]:
def get_stats(data, column):
    """Print the number of distinct values of ``column`` per compound class.

    The count is reported for the full dataset and for the subsets of rows
    selected by the value of the ``DTI`` column: comparators (``DT``),
    drugs (``D_DT``), all clinical candidates (``C0_DT`` to ``C3_DT``) and
    each clinical candidate label on its own.

    Missing values are counted as a single distinct value. A plain
    ``len(set(...))`` would count every NaN separately, because NaN never
    compares equal to NaN, and would therefore inflate the numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a ``DTI`` column and the column named by ``column``.
    column : str
        Name of the column whose distinct values are counted.

    Returns
    -------
    None
        The statistics are written to stdout, followed by an empty line.
    """
    # (printed label, DTI labels selecting the subset); None selects all rows.
    subsets = [
        ('Full dataset:', None),
        ('Comparators', ['DT']),
        ('Drugs', ['D_DT']),
        ('Clinical Candidates', ['C0_DT', 'C1_DT', 'C2_DT', 'C3_DT']),
        ('Clinical Candidates, max_phase = 3', ['C3_DT']),
        ('Clinical Candidates, max_phase = 2', ['C2_DT']),
        ('Clinical Candidates, max_phase = 1', ['C1_DT']),
        ('Clinical Candidates, max_phase < 1', ['C0_DT']),
    ]

    print(column)
    for label, dti_labels in subsets:
        if dti_labels is None:
            values = data[column]
        else:
            values = data.loc[data['DTI'].isin(dti_labels), column]
        # dropna=False keeps a missing value as one distinct entry.
        print(f"{label: <40} {values.nunique(dropna=False)}")
    print()

In [84]:
# Report the distinct-value statistics of df_combined_BF for each identifier column in turn.
for stats_column in (
    "parent_molregno",
    "tid",
    "tid_mutation",
    "cpd_target_pair",
    "cpd_target_pair_mutation",
):
    get_stats(df_combined_BF, stats_column)

parent_molregno
Full dataset:                            402282
Comparators                              400167
Drugs                                    1740
Clinical Candidates                      1578
Clinical Candidates, max_phase = 3       536
Clinical Candidates, max_phase = 2       837
Clinical Candidates, max_phase = 1       179
Clinical Candidates, max_phase < 1       26

tid
Full dataset:                            1398
Comparators                              1117
Drugs                                    845
Clinical Candidates                      945
Clinical Candidates, max_phase = 3       563
Clinical Candidates, max_phase = 2       713
Clinical Candidates, max_phase = 1       261
Clinical Candidates, max_phase < 1       58

tid_mutation
Full dataset:                            2287
Comparators                              1943
Drugs                                    1057
Clinical Candidates                      1138
Clinical Candidates, max_phase = 3       648
Clinical