# Notebook to extract and curate ChEMBL data for the Leeson data set (drug-target interactions)

### Authors: Barbara Zdrazil, Lina Heinzke
### 10/2022

**This notebook extracts data from ChEMBL and performs some curation steps in order to retrieve a data set for drug-target, and clinical candidate-target associations including comparator compounds.**

**The notebook is based on initial work by Anne Hersey, Patricia Bento, Emma Manners, Paul Leeson, and Andrew Leach.**

**More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes**


In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Scaffolds import MurckoScaffold

#### notebook settings
# Raise the pandas display limits so the wide ChEMBL tables remain readable.
for _option, _value in (("display.max_rows", 100),
                        ("display.max_columns", 50),
                        ("display.max_colwidth", 100)):
    pd.set_option(_option, _value)

# Switches that are off by default; they are not read in this part of the notebook
# (presumably they toggle optional diagnostic output further down -- confirm).
output_missing_things_1 = output_missing_things_2 = False

# Get data from ChEMBL

In [2]:
# # @Barbara: uncomment and modify this to your preferred paths
# path_results = "/Users/bzdrazil/Dropbox/ChEMBL/NP/data/"
# path_sqlite3_database = <your sqlite database location>

# ChEMBL release to use and the machine-specific project folder (adapt base_path locally).
chembl_version = "26"
base_path = "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/"
path_results = f"{base_path}results/"
# SQLite layout: <base_path>data/chembl_<v>/chembl_<v>_sqlite/chembl_<v>.db
_chembl_release = f"chembl_{chembl_version}"
path_sqlite3_database = f"{base_path}data/{_chembl_release}/{_chembl_release}_sqlite/{_chembl_release}.db"

In [3]:
# # @Barbara: Accessing ChEMBL using Oracle
# # If you want to use this option, you have to change the sql statements to the commented line:
# # from docs -> from chembl_31.docs

# import cx_Oracle

# #cx_Oracle.init_oracle_client(lib_dir="/Users/bzdrazil/Downloads/instantclient_19_8")  #https://www.oracle.com/in/database/technologies/instant-client/macos-intel-x86-downloads.html; https://stackoverflow.com/questions/56119490/cx-oracle-error-dpi-1047-cannot-locate-a-64-bit-oracle-client-library
# cx_Oracle.clientversion() 

# import pandas as pd
# import sqlalchemy as sa
# import requests

# workdir = '/Users/bzdrazil/Desktop/'

# chemdev2 = 'oracle://{}:{}@ora-dlvm-103.ebi.ac.uk:1521/?service_name=chemdev2'.format('user', 'pw') # insert your username and password
# engine_ch = sa.create_engine(chemdev2)

In [4]:
import sqlite3

# Open the local ChEMBL SQLite file; the SQL queries in the following cells read through this connection.
engine_ch = sqlite3.connect(path_sqlite3_database)

In [5]:
sql = '''
SELECT act.molregno, act.pchembl_value, act.standard_type, 
    ass.assay_type, ass.tid, 
    vs.mutation,
    td.pref_name as target_pref_name, td.target_type, td.organism, td.chembl_id as target_chembl_id,
    docs.year, docs.journal
FROM activities act
INNER JOIN assays ass 
    on  act.assay_id = ass.assay_id
LEFT JOIN variant_sequences vs
    on ass.variant_id = vs.variant_id
INNER JOIN target_dictionary td
    on ass.tid = td.tid
INNER JOIN docs
    on act.doc_id = docs.doc_id
WHERE act.potential_duplicate = 0
    and act.standard_relation = '='
    and data_validity_comment is null
    and td.tid <>22226   ----exclude unchecked targets
    and td.target_type like '%PROTEIN%'
    -- and ass.assay_type = 'B' -- only binding assays, will be taken care of later
'''

# One row per activity measurement, annotated with assay, target and document information.
df_mols = pd.read_sql_query(sql, con=engine_ch)

# Target key that keeps variants apart: "<tid>-<mutation>" when the assay has a
# variant sequence, plain "<tid>" otherwise.
tid_as_text = df_mols['tid'].astype('str')
has_mutation = df_mols['mutation'].notnull()
df_mols['tid_mutation'] = np.where(has_mutation, tid_as_text+'-'+df_mols['mutation'], tid_as_text)
df_mols

Unnamed: 0,molregno,pchembl_value,standard_type,assay_type,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,year,journal,tid_mutation
0,252199,5.40,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
1,253534,4.77,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
2,253199,6.75,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
3,253199,5.22,IC50,A,12594,,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,2004.0,Bioorg. Med. Chem. Lett.,12594
4,253199,4.43,IC50,A,17045,,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,2004.0,Bioorg. Med. Chem. Lett.,17045
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3343627,2317951,7.89,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134
3343628,2325859,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522
3343629,198115,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522
3343630,2317531,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522


In [6]:
# salt and parent info are queried for molregno, chemblid and pref_name
# compound info about max_phase, usan_year, first_approval, prodrug, oral, parenteral, topical and black_box_warning
# are based on the *parent*
# The query pairs every molecule_hierarchy entry (salt_* columns) with its parent
# (parent_* columns plus the parent's drug annotations).

sql = """
SELECT DISTINCT md2.molregno as parent_molregno, md2.chembl_id as parent_chemblid, md2.pref_name as parent_pref_name
    , md.molregno as salt_molregno, md.chembl_id as salt_chemblid, md.pref_name as salt_pref_name
    -- optional: query salt-based information
    -- , md.max_phase as salt_max_phase, md.usan_year as salt_usan_year, md.first_approval as salt_first_approval
    -- , md.prodrug as salt_prodrug, md.oral as salt_oral, md.parenteral as salt_parenteral, md.topical as salt_topical
    -- , md.black_box_warning as salt_black_box_warning
    , md2.max_phase, md2.usan_year, md2.first_approval
    , md2.prodrug, md2.oral, md2.parenteral, md2.topical, md2.black_box_warning
--First join parent_cmpds and salt_cmpds (ie their children)
FROM molecule_dictionary md                                        --salt_molregno
JOIN molecule_hierarchy mh 
    ON md.molregno = mh.molregno
JOIN molecule_dictionary md2 
    ON mh.parent_molregno = md2.molregno                           --parent_molregno
"""
# Lookup table used below to translate salt molregnos into parent molregnos.
df_md_info = pd.read_sql_query(sql, engine_ch)

In [7]:
# Attach parent/salt identifiers and the parent-level drug annotations to every activity row;
# the activity's molregno is matched against salt_molregno, rows without a match are kept (left join).
df_mols = pd.merge(df_mols, df_md_info, how='left', left_on='molregno', right_on='salt_molregno')
df_mols

Unnamed: 0,molregno,pchembl_value,standard_type,assay_type,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,year,journal,tid_mutation,parent_molregno,parent_chemblid,parent_pref_name,salt_molregno,salt_chemblid,salt_pref_name,max_phase,usan_year,first_approval,prodrug,oral,parenteral,topical,black_box_warning
0,252199,5.40,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,252199,CHEMBL357278,,252199,CHEMBL357278,,0,,,-1,0,0,0,0
1,253534,4.77,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253534,CHEMBL357119,,253534,CHEMBL357119,,0,,,-1,0,0,0,0
2,253199,6.75,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0
3,253199,5.22,IC50,A,12594,,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,2004.0,Bioorg. Med. Chem. Lett.,12594,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0
4,253199,4.43,IC50,A,17045,,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,2004.0,Bioorg. Med. Chem. Lett.,17045,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3343627,2317951,7.89,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134,2317951,CHEMBL4278500,,2317951,CHEMBL4278500,,0,,,-1,0,0,0,0
3343628,2325859,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,2325859,CHEMBL4286411,,2325859,CHEMBL4286411,,0,,,-1,0,0,0,0
3343629,198115,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,198115,CHEMBL120632,TETRAGASTRIN,198115,CHEMBL120632,TETRAGASTRIN,0,,,0,0,0,0,0
3343630,2317531,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,2317531,CHEMBL4278080,,2317531,CHEMBL4278080,,0,,,-1,0,0,0,0


In [8]:
sql = """
SELECT DISTINCT docs.year, cr.molregno
FROM docs
LEFT JOIN compound_records cr
    ON docs.doc_id = cr.doc_id
"""

# (year, molregno) pairs with both values present, mapped from salt to parent molregno.
df_docs = (
    pd.read_sql_query(sql, con=engine_ch)
    .dropna()
    .astype({'year': 'Int64', 'molregno': 'Int64'})
    .merge(df_md_info[['salt_molregno', 'parent_molregno']],
           left_on='molregno', right_on='salt_molregno', how='left')
)
# Earliest document year per parent compound, pooled over all of its salt forms.
earliest_year = df_docs.groupby('parent_molregno')['year'].transform('min')
df_docs = (
    df_docs
    .assign(first_publication_cmpd=earliest_year)
    [['parent_molregno', 'first_publication_cmpd']]
    .drop_duplicates()
)
df_docs

Unnamed: 0,parent_molregno,first_publication_cmpd
0,4941,1974
1,921,1974
2,1005421,1976
3,1750777,1976
4,1750778,1976
...,...,...
1489815,2329285,2018
1489816,2317951,2018
1489817,2325859,2018
1489819,2317531,2018


In [9]:
# Add the compound-level first publication year to every activity row.
df_mols = pd.merge(df_mols, df_docs, how='left', on='parent_molregno')
df_mols

Unnamed: 0,molregno,pchembl_value,standard_type,assay_type,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,year,journal,tid_mutation,parent_molregno,parent_chemblid,parent_pref_name,salt_molregno,salt_chemblid,salt_pref_name,max_phase,usan_year,first_approval,prodrug,oral,parenteral,topical,black_box_warning,first_publication_cmpd
0,252199,5.40,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,252199,CHEMBL357278,,252199,CHEMBL357278,,0,,,-1,0,0,0,0,2004
1,253534,4.77,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253534,CHEMBL357119,,253534,CHEMBL357119,,0,,,-1,0,0,0,0,2004
2,253199,6.75,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0,2004
3,253199,5.22,IC50,A,12594,,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,2004.0,Bioorg. Med. Chem. Lett.,12594,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0,2004
4,253199,4.43,IC50,A,17045,,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,2004.0,Bioorg. Med. Chem. Lett.,17045,253199,CHEMBL152968,,253199,CHEMBL152968,,0,,,-1,0,0,0,0,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3343627,2317951,7.89,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134,2317951,CHEMBL4278500,,2317951,CHEMBL4278500,,0,,,-1,0,0,0,0,2018
3343628,2325859,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,2325859,CHEMBL4286411,,2325859,CHEMBL4286411,,0,,,-1,0,0,0,0,2018
3343629,198115,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,198115,CHEMBL120632,TETRAGASTRIN,198115,CHEMBL120632,TETRAGASTRIN,0,,,0,0,0,0,0,1987
3343630,2317531,,Bmax,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,2317531,CHEMBL4278080,,2317531,CHEMBL4278080,,0,,,-1,0,0,0,0,2018


In [10]:
# Nullable integer dtype: keeps the year columns integral even where values are missing.
nullable_year_columns = ['year', 'first_approval']
df_mols = df_mols.astype({column: 'Int64' for column in nullable_year_columns})

In [11]:
# df_mols.to_csv(path_results+"ChEMBL"+chembl_version+"_initial_query.csv", sep = ';', index = False)

In [12]:
############### TESTING: method to print comparison to original dataset ###############
# Accumulators filled by add_dataset_sizes(); one entry per recorded curation stage.
all_length_comp = []
all_length_comp_d_dt_pchembl = []


def calculate_dataset_sizes(data):
    """Count distinct compounds, targets and compound-target pairs in ``data``.

    ``data`` must provide the columns ``parent_molregno``, ``tid_mutation`` and
    ``parent_molregno_tid_mutation``. Rows counting as "drug" rows are those with
    ``DTI == "D_DT"`` when a ``DTI`` column exists, otherwise those with
    ``max_phase == 4``.

    Returns a list of six integers:
    ``[#mols, #drugs, #targets, #drug_targets, #pairs, #drug_pairs]``.

    Missing values count as a single distinct value (``nunique(dropna=False)``),
    so float NaNs are not counted once per row as ``len(set(series))`` would do.
    """
    # Build the drug filter once instead of re-evaluating it for every column.
    if 'DTI' in data.columns:
        is_drug_row = data["DTI"] == "D_DT"
    else:
        is_drug_row = data["max_phase"] == 4
    drug_rows = data[is_drug_row]

    sizes = []
    for column in ("parent_molregno", "tid_mutation", "parent_molregno_tid_mutation"):
        sizes.append(int(data[column].nunique(dropna=False)))
        sizes.append(int(drug_rows[column].nunique(dropna=False)))
    return sizes


def add_dataset_sizes(data, label, output=False):
    """Record the dataset sizes of ``data`` under ``label``.

    Appends ``[label, #mols, #drugs, #targets, #drug_targets, #pairs, #drug_pairs]``
    to ``all_length_comp`` (all rows) and to ``all_length_comp_d_dt_pchembl``
    (only rows with a pchembl value, taken from ``pchembl_value`` if that column
    exists, otherwise from ``pchembl_value_mean``).

    ``data`` itself is not modified. ``output`` is accepted for backward
    compatibility and is currently unused.
    """
    # Vectorised "<parent_molregno>_<tid_mutation>" key; a row-wise
    # DataFrame.agg(str.format, axis=1) is far slower on millions of rows.
    pair_key = data['parent_molregno'].astype(str) + '_' + data['tid_mutation'].astype(str)
    data_test = data.assign(parent_molregno_tid_mutation=pair_key)

    all_length_comp.append([label] + calculate_dataset_sizes(data_test))

    # only data with pchembl value
    if 'pchembl_value' in data_test.columns:
        pchembl_column = 'pchembl_value'
    else:
        pchembl_column = 'pchembl_value_mean'
    data_pchembl = data_test[data_test[pchembl_column].notnull()]
    all_length_comp_d_dt_pchembl.append([label] + calculate_dataset_sizes(data_pchembl))

In [13]:
############### TESTING: initial query ###############
# Baseline snapshot of the unfiltered query result; the third argument (output) is accepted
# by add_dataset_sizes but not used in its body.
add_dataset_sizes(df_mols, "init", True)

# Calculate mean, median, and max pChEMBL values for each compound-target pair

In [14]:
# Summarise each compound-target pair over binding (B) and functional (F) assays:
# mean / max / median pchembl value and the first publication year.
# (The next cell repeats the same aggregation for binding assays only.)
pair_keys = ['parent_molregno', 'tid_mutation']
df_mols_all = df_mols[df_mols['assay_type'].isin(['B', 'F'])].copy()
for statistic in ('mean', 'max', 'median'):
    df_mols_all['pchembl_value_' + statistic] = (
        df_mols_all.groupby(pair_keys)['pchembl_value'].transform(statistic)
    )
df_mols_all['first_publication_target_cmpd_pair'] = df_mols_all.groupby(pair_keys)['year'].transform('min')

# First publication year restricted to measurements that actually carry a pchembl value.
df_mols_all_first_publication_pchembl = (
    df_mols_all[df_mols_all['pchembl_value'].notnull()]
    .groupby(pair_keys)['year']
    .min()
    .reset_index()
    .rename(columns={"year": "first_publication_target_cmpd_pair_w_pchembl"})
)
df_mols_all = df_mols_all.merge(df_mols_all_first_publication_pchembl, on=pair_keys, how='left')

In [15]:
# Same per-pair summary as for df_mols_all, but computed from binding assays (assay_type 'B') only.
pair_keys = ['parent_molregno', 'tid_mutation']
df_mols_binding = df_mols[df_mols['assay_type'].eq('B')].copy()
for statistic in ('mean', 'max', 'median'):
    df_mols_binding['pchembl_value_' + statistic] = (
        df_mols_binding.groupby(pair_keys)['pchembl_value'].transform(statistic)
    )
df_mols_binding['first_publication_target_cmpd_pair'] = df_mols_binding.groupby(pair_keys)['year'].transform('min')

# First publication year restricted to measurements that actually carry a pchembl value.
df_mols_binding_first_publication_pchembl = (
    df_mols_binding[df_mols_binding['pchembl_value'].notnull()]
    .groupby(pair_keys)['year']
    .min()
    .reset_index()
    .rename(columns={"year": "first_publication_target_cmpd_pair_w_pchembl"})
)
df_mols_binding = df_mols_binding.merge(df_mols_binding_first_publication_pchembl, on=pair_keys, how='left')

In [16]:
# Stack both summaries into one table: only_binding = False marks values aggregated over
# B+F assays, only_binding = True marks values aggregated over binding assays only.
df_mols_all['only_binding'] = False
df_mols_binding['only_binding'] = True

# Salt-level identifiers and per-measurement fields are dropped so that, after
# de-duplication, the aggregated rows per compound-target pair remain.
columns_to_drop = ['molregno', 'salt_molregno', 'salt_chemblid', 'salt_pref_name',
                   'year', 'journal', 'pchembl_value', 'standard_type', 'assay_type']
df_combined = (
    pd.concat([df_mols_all, df_mols_binding])
    .drop(columns=columns_to_drop)
    .drop_duplicates()
)
df_combined

Unnamed: 0,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,tid_mutation,parent_molregno,parent_chemblid,parent_pref_name,max_phase,usan_year,first_approval,prodrug,oral,parenteral,topical,black_box_warning,first_publication_cmpd,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,first_publication_target_cmpd_pair_w_pchembl,only_binding
0,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,252199,CHEMBL357278,,0,,,-1,0,0,0,0,2004,5.400,5.40,5.400,2004,2004,False
1,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,253534,CHEMBL357119,,0,,,-1,0,0,0,0,2004,4.770,4.77,4.770,2004,2004,False
2,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,253199,CHEMBL152968,,0,,,-1,0,0,0,0,2004,6.750,6.75,6.750,2004,2004,False
3,10989,,Carbonic anhydrase XIII,SINGLE PROTEIN,Mus musculus,CHEMBL2186,10989,933,CHEMBL268439,,0,,,-1,0,0,0,0,1999,8.700,8.70,8.700,2004,2004,False
4,105567,,Adenosine A1 receptor,SINGLE PROTEIN,Cavia porcellus,CHEMBL2304404,105567,606480,CHEMBL608018,,0,,,-1,0,0,0,0,2004,,,,2004,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886001,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2326437,CHEMBL4286989,,0,,,-1,0,0,0,0,2018,5.890,5.89,5.890,2018,2018,True
1886002,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2321411,CHEMBL4281963,,0,,,-1,0,0,0,0,2018,9.400,9.70,9.400,2018,2018,True
1886006,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2322887,CHEMBL4283439,,0,,,-1,0,0,0,0,2018,6.025,6.52,6.025,2018,2018,True
1886012,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,11522,2325859,CHEMBL4286411,,0,,,-1,0,0,0,0,2018,7.875,8.41,7.875,2018,2018,True


In [17]:
# Keep a row if it has an aggregated pchembl value OR the compound is clinical (max_phase > 0);
# clinical compounds are therefore retained even without a pchembl value.
has_pchembl = df_combined['pchembl_value_mean'].notnull()
is_clinical = df_combined['max_phase'] > 0
df_combined = df_combined[has_pchembl | is_clinical].copy()
df_combined

Unnamed: 0,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,tid_mutation,parent_molregno,parent_chemblid,parent_pref_name,max_phase,usan_year,first_approval,prodrug,oral,parenteral,topical,black_box_warning,first_publication_cmpd,pchembl_value_mean,pchembl_value_max,pchembl_value_median,first_publication_target_cmpd_pair,first_publication_target_cmpd_pair_w_pchembl,only_binding
0,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,252199,CHEMBL357278,,0,,,-1,0,0,0,0,2004,5.400,5.40,5.400,2004,2004,False
1,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,253534,CHEMBL357119,,0,,,-1,0,0,0,0,2004,4.770,4.77,4.770,2004,2004,False
2,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,10483,253199,CHEMBL152968,,0,,,-1,0,0,0,0,2004,6.750,6.75,6.750,2004,2004,False
3,10989,,Carbonic anhydrase XIII,SINGLE PROTEIN,Mus musculus,CHEMBL2186,10989,933,CHEMBL268439,,0,,,-1,0,0,0,0,1999,8.700,8.70,8.700,2004,2004,False
9,11643,,DNA topoisomerase III,SINGLE PROTEIN,Bacillus subtilis (strain 168),CHEMBL4320,11643,82960,CHEMBL54530,,0,,,-1,0,0,0,0,1980,4.720,4.72,4.720,1984,1984,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886001,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2326437,CHEMBL4286989,,0,,,-1,0,0,0,0,2018,5.890,5.89,5.890,2018,2018,True
1886002,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2321411,CHEMBL4281963,,0,,,-1,0,0,0,0,2018,9.400,9.70,9.400,2018,2018,True
1886006,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,134,2322887,CHEMBL4283439,,0,,,-1,0,0,0,0,2018,6.025,6.52,6.025,2018,2018,True
1886012,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,11522,2325859,CHEMBL4286411,,0,,,-1,0,0,0,0,2018,7.875,8.41,7.875,2018,2018,True


# Extract drug-target interactions with disease relevance from drug_mechanism table

In [18]:
sql_dti = '''
SELECT DISTINCT mh.parent_molregno, dm.tid, dm.disease_efficacy
FROM drug_mechanism dm
INNER JOIN molecule_hierarchy mh
    on dm.molregno = mh.molregno
INNER JOIN molecule_dictionary md
    on mh.parent_molregno = md.molregno
'''

# Curated mechanisms at parent-compound level; mechanisms without a target id are discarded
# and tid is stored as nullable integer.
df_dti = pd.read_sql_query(sql_dti, con=engine_ch).dropna(subset=['tid'])
df_dti = df_dti.astype({'tid': 'Int64'})
df_dti

Unnamed: 0,parent_molregno,tid,disease_efficacy
0,1124,11060,1
1,675068,10193,1
2,1125,10193,1
3,1085,10193,1
4,1124,10193,1
...,...,...,...
4756,1304559,101019,1
4757,1304559,100417,1
4758,2336099,11540,1
4759,2146132,100097,1


In [19]:
# disease_efficacy (NUMBER): flag showing whether the assigned target is believed to play a role
# in the efficacy of the drug in the indication(s) for which it is approved (1 = yes, 0 = no).
# Only mechanisms flagged with 1 are kept.
df_dti = df_dti.loc[df_dti['disease_efficacy'].eq(1)].copy()

## Add single proteins for protein families, protein complexes, protein complex groups
## Query protein mappings

In [20]:
# All target-target relations, with name/type/organism of both sides:
# suffix _1 describes tr.tid, suffix _2 describes tr.related_tid.
# NOTE: the variable name sql_dti is reused here although this query reads target_relations.
sql_dti = '''
SELECT tr.tid, tr.relationship, tr.related_tid, 
    td1.pref_name as pref_name_1, td1.target_type as target_type_1, td1.organism as organism_1, 
    td2.pref_name as pref_name_2, td2.target_type as target_type_2, td2.organism as organism_2, td2.chembl_id as Target_chembl_id_2 
FROM target_relations tr
INNER JOIN target_dictionary td1
    on tr.tid = td1.tid
INNER JOIN target_dictionary td2
    on tr.related_tid = td2.tid
'''

df_related_targets = pd.read_sql_query(sql_dti, con=engine_ch)
df_related_targets.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
0,10193,SUBSET OF,104764,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,Carbonic anhydrase,PROTEIN FAMILY,Homo sapiens,CHEMBL2095180
1,12071,SUBSET OF,109746,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase,PROTEIN FAMILY,Homo sapiens,CHEMBL3559691
2,12071,SUBSET OF,104709,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/cyclin B,PROTEIN COMPLEX,Homo sapiens,CHEMBL2094127
3,12071,SUBSET OF,107893,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,CDK1/Cyclin A,PROTEIN COMPLEX,Homo sapiens,CHEMBL3038467
4,12071,SUBSET OF,117095,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/G1/S-specific cyclin-D1,PROTEIN COMPLEX,Homo sapiens,CHEMBL3885551


In [21]:
def _single_protein_members(parent_type, relationship="SUPERSET OF"):
    """Rows of df_related_targets that link a target of ``parent_type`` (side 1)
    to a SINGLE PROTEIN (side 2) through the given ``relationship``."""
    is_parent_type = df_related_targets["target_type_1"] == parent_type
    is_single_protein = df_related_targets["target_type_2"] == "SINGLE PROTEIN"
    is_relationship = df_related_targets["relationship"] == relationship
    return df_related_targets[is_parent_type & is_single_protein & is_relationship]


protein_complex_mapping = _single_protein_members("PROTEIN COMPLEX")

protein_family_mapping = _single_protein_members("PROTEIN FAMILY")

protein_complex_group_mapping = _single_protein_members("PROTEIN COMPLEX GROUP")

# TODO: should these be included? which direction (how to avoid duplications)?
single_protein_mapping = _single_protein_members("SINGLE PROTEIN", relationship="EQUIVALENT TO")

# TODO: should these be included?
chimeric_protein_mapping = _single_protein_members("CHIMERIC PROTEIN")

# TODO: should these be included?
ppi_mapping = _single_protein_members("PROTEIN-PROTEIN INTERACTION")

In [22]:
# Stack all mapping tables; tid is cast to nullable integer, the dtype df_dti uses for its tid column.
mapping_frames = [
    protein_complex_mapping,
    protein_family_mapping,
    protein_complex_group_mapping,
    single_protein_mapping,
    chimeric_protein_mapping,
    ppi_mapping,
]
relevant_mappings = pd.concat(mapping_frames).astype({'tid': 'Int64'})
relevant_mappings.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
249,104282,SUPERSET OF,10819,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein alpha chain,SINGLE PROTEIN,Homo sapiens,CHEMBL4808
251,104282,SUPERSET OF,12715,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein delta chain,SINGLE PROTEIN,Homo sapiens,CHEMBL3011
260,104284,SUPERSET OF,10023,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor protein alpha-2 subunit,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2584
261,104284,SUPERSET OF,12717,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor subunit beta-4,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2658
275,104685,SUPERSET OF,49,Integrin alpha-IIb/beta-3,PROTEIN COMPLEX,Homo sapiens,Integrin alpha-IIb,SINGLE PROTEIN,Homo sapiens,CHEMBL212


In [23]:
# DT-interactions and targets based on drug_mechanisms table
# Each interaction is encoded as the string "<parent_molregno>_<tid>"; the DT_assoc column
# built for df_combined uses the same pattern so that both can be compared with isin().
DTIs_original = set(df_dti.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))
dti_tids_original = set(df_dti['tid'])

# DT-interactions and targets with other target IDs based on mapping
# (inner merge: only mechanisms whose tid has an entry in relevant_mappings contribute;
#  the mapped interaction uses related_tid instead of the original tid)
df_dti_add_targets = df_dti.merge(relevant_mappings, on = 'tid', how = 'inner')
DTIs_mapped = set(df_dti_add_targets.agg('{0[parent_molregno]}_{0[related_tid]}'.format, axis=1))
dti_tids_mapped = set(df_dti_add_targets['related_tid'].astype("int"))

# combined DT-interactions and targets
DTIs_set = DTIs_original.union(DTIs_mapped)
dti_tids_set = dti_tids_original.union(dti_tids_mapped)

# DTI classification

Identify which TIDs are drug targets from the drug_mechanism table and add field called "defined_DTI": 
    Value: "True" if it is a drug with a curated drug_mechanism and "False" if not.

Map again to the drug mechanism table via "tid" to identify therapeutic targets and add column "therapeutic_target":
    Value: "True" | "False"

Use this logic to define per compound/target pair whether it corresponds to:
    
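    drug (max_phase 4) and its therapeutic target "DTI": "D_DT";

    compound with max_phase 3, 2, 1 or 0 and its curated target "DTI": "C3_DT", "C2_DT", "C1_DT" or "C0_DT";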
        
    a drug target but not a drug "DTI": "DT";
        
    not a drug and not a drug target "DTI": "NDT";
        
(done by first creating a new column "DT_assoc" in both the master and the mapping table which reflects the molregno-tid association and then mapping it)

In [24]:
# drugs_set = set(df_combined[df_combined['max_phase'] == 4]["parent_molregno"])
# df_combined['is_drug'] = df_combined['parent_molregno'].isin(drugs_set)

# Flag rows whose target occurs in the curated (or mapped) drug-mechanism targets.
df_combined['therapeutic_target'] = df_combined['tid'].isin(dti_tids_set)
# "<parent_molregno>_<tid>" key, the same pattern as the entries of DTIs_set.
# Built with vectorised string concatenation: a row-wise agg(str.format, axis=1)
# yields the same keys but runs a Python-level loop over every row of the table.
df_combined['DT_assoc'] = df_combined['parent_molregno'].astype(str) + '_' + df_combined['tid'].astype(str)

In [25]:
# Membership of the compound-target pair in the curated/mapped interactions, computed once.
in_curated_dti = df_combined['DT_assoc'].isin(DTIs_set)

# Placeholder; it remains only for curated pairs whose max_phase is none of 0-4.
df_combined['DTI'] = "Nan"

# Curated pairs are labelled according to the max_phase of the compound.
phase_labels = {4: "D_DT", 3: "C3_DT", 2: "C2_DT", 1: "C1_DT", 0: "C0_DT"}
for phase, label in phase_labels.items():
    df_combined.loc[in_curated_dti & (df_combined['max_phase'] == phase), 'DTI'] = label

# Non-curated pairs: "DT" if the target is a therapeutic target at all, otherwise "NDT".
# (If the target is not a therapeutic target, the pair cannot be in DTIs_set;
#  the explicit ~in_curated_dti is kept for clarity.)
df_combined.loc[~in_curated_dti & df_combined['therapeutic_target'].eq(True), 'DTI'] = "DT"
df_combined.loc[~in_curated_dti & df_combined['therapeutic_target'].eq(False), 'DTI'] = "NDT"

In [26]:
############### TESTING: before reducing to D_DT and DT ###############
# Snapshot of the dataset sizes while all DTI classes (including NDT) are still present.
add_dataset_sizes(df_combined, "pre DTI")

In [27]:
# Keep curated compound-target pairs (D_DT, C3_DT, C2_DT, C1_DT, C0_DT) and other pairs on
# therapeutic targets (DT); rows labelled "NDT" or still carrying the "Nan" placeholder are dropped.
kept_dti_classes = ['D_DT', 'C3_DT', 'C2_DT', 'C1_DT', 'C0_DT', 'DT']
df_combined = df_combined.loc[df_combined['DTI'].isin(kept_dti_classes)]

In [28]:
############### TESTING: after reducing to D_DT and DT ###############
# Snapshot after the filter above, which keeps D_DT, C3_DT, C2_DT, C1_DT, C0_DT and DT rows.
add_dataset_sizes(df_combined, "post DTI")

# Add compound properties

In [29]:
# Calculated properties and structures of the *parent* compounds.
# The join runs through molecule_hierarchy without DISTINCT, so a parent is returned once
# per hierarchy row pointing to it; the duplicates are removed in the next cell.
sql_cpd_props = '''

SELECT mh.parent_molregno, 
    cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
    cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
    cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
    cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
    struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
FROM compound_properties cp, molecule_hierarchy mh, compound_structures struct
-- FROM chembl_31.compound_properties cp, chembl_31.molecule_hierarchy mh, chembl_31.compound_structures struct
WHERE cp.molregno=mh.parent_molregno
    and struct.molregno=mh.parent_molregno

'''

df_cpd_props = pd.read_sql_query(sql_cpd_props, con=engine_ch)
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,1,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.48,,3.63,2.69,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-11(21-17(24)20-15(22)9-19-21)6-7-12(10)16(23)13-4-2-3-5-14(13)18/h...,OWRSAHYFSSNENM-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,332.32,1.33,6.0,1.0,108.61,3.0,N,0.0,6.33,,2.88,1.82,ACID,332.32,3.0,25.0,0.73,332.0909,C18H12N4O3,7.0,1.0,0.0,InChI=1S/C18H12N4O3/c1-11-8-14(22-18(25)21-16(23)10-20-22)6-7-15(11)17(24)13-4-2-12(9-19)3-5-13/...,ZJYUMURGSZQFMH-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,357.8,2.27,5.0,2.0,87.98,3.0,N,0.0,6.33,,3.7,2.64,ACID,357.8,3.0,25.0,0.75,357.088,C18H16ClN3O3,6.0,2.0,0.0,InChI=1S/C18H16ClN3O3/c1-10-7-14(22-18(25)21-15(23)9-20-22)8-11(2)16(10)17(24)12-3-5-13(19)6-4-1...,YOMWDCALSDWFSV-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,307.31,1.46,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.02,1.97,ACID,307.31,3.0,23.0,0.74,307.0957,C17H13N3O3,6.0,1.0,0.0,InChI=1S/C17H13N3O3/c1-11-2-4-12(5-3-11)16(22)13-6-8-14(9-7-13)20-17(23)19-15(21)10-18-20/h2-10H...,PSOPUAQFGCRDIP-UHFFFAOYSA-N,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.63,2.57,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-13(21-17(24)20-15(22)9-19-21)6-7-14(10)16(23)11-2-4-12(18)5-3-11/h...,KEZNSCMBVRNOHO-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [30]:
# Collapse the repeated parent rows produced by the join through molecule_hierarchy.
df_cpd_props_unique = df_cpd_props.drop_duplicates()

In [31]:
# df_cpd_props_unique.to_csv(path_results+"ChEMBL"+chembl_version+"_cpd_props.csv", sep = ';', index = False)

## Combine initial query with compound properties

In [32]:
# Attach the compound properties to every row of the parent compound.
# The inner join drops rows whose parent_molregno has no property entry.
df_combined = pd.merge(df_combined, df_cpd_props_unique, how='inner', on='parent_molregno')

In [33]:
# df_combined.to_csv(path_results+"ChEMBL"+chembl_version+"_cpds_with_props.csv", sep = ';', index = False)

In [34]:
############### TESTING: compound props ###############
# Record the size of the dataset after the compound-property merge under the label "cpd props".
# NOTE(review): add_dataset_sizes is defined earlier in the notebook (not visible here); presumably it
# appends one row per call to the size-tracking lists shown in the overview tables at the end - confirm.
add_dataset_sizes(df_combined, "cpd props")

# Calculate LE metrics

Ligand Efficiency (LE):

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $ \Delta\text{G} = − RT \ln(K_d)$, $− RT\ln(K_i)$, or $− RT\ln(IC_{50})$

$\text{LE}=\frac{(2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl_mean})} {\text{heavy_atoms}}$


$\text{BEI}=\frac{\text{pchembl_mean} \cdot 1000} {\text{mw_freebase}}$

$\text{SEI}=\frac{\text{pchembl_mean} \cdot 100} {\text{PSA}}$

$\text{LLE}=\text{pchembl_mean}-\text{ALOGP}$

In [35]:
# All four efficiency metrics are derived from the mean pchembl value of each row.
# LE: the constant 2.303 * 298 * 0.00199 is the factor from the LE formula above
# (presumably ln(10) * T [K] * R [kcal/(mol*K)]).
df_combined['LE'] = df_combined['pchembl_value_mean'].div(df_combined['heavy_atoms']).mul(2.303*298*0.00199)
# BEI: potency per kDa of the free base.
df_combined['BEI'] = df_combined['pchembl_value_mean'].mul(1000).div(df_combined["mw_freebase"])
# SEI: potency per 100 A^2 of polar surface area.
df_combined['SEI'] = df_combined['pchembl_value_mean'].mul(100).div(df_combined["psa"])
# LLE: potency corrected for lipophilicity.
df_combined['LLE'] = df_combined['pchembl_value_mean'].sub(df_combined["alogp"])

# Add Descriptors

In [36]:
# Names of all descriptors RDKit can calculate (first element of every descriptor entry).
print([descriptor[0] for descriptor in Descriptors._descList])

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'Slo

In [37]:
# PandasTools.AddMoleculeColumnToFrame(df_combined,'canonical_smiles','mol',includeFingerprints=False)

# df_combined.loc[:,'FractionCSP3'] = df_combined['mol'].apply(Descriptors.FractionCSP3)
# df_combined.loc[:,'NumAliphaticCarbocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticCarbocycles)
# df_combined.loc[:,'NumAliphaticHeterocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticHeterocycles)
# df_combined.loc[:,'NumAliphaticRings'] = df_combined['mol'].apply(Descriptors.NumAliphaticRings)
# df_combined.loc[:,'NumAromaticCarbocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticCarbocycles)
# df_combined.loc[:,'NumAromaticHeterocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticHeterocycles)
# df_combined.loc[:,'NumAromaticRings'] = df_combined['mol'].apply(Descriptors.NumAromaticRings)
# df_combined.loc[:,'NumHeteroatoms'] = df_combined['mol'].apply(Descriptors.NumHeteroatoms)
# df_combined.loc[:,'NumSaturatedCarbocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedCarbocycles)
# df_combined.loc[:,'NumSaturatedHeterocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedHeterocycles)
# df_combined.loc[:,'NumSaturatedRings'] = df_combined['mol'].apply(Descriptors.NumSaturatedRings)
# df_combined.loc[:,'RingCount'] = df_combined['mol'].apply(Descriptors.RingCount)
# df_combined.loc[:,'NumStereocentres'] = df_combined['mol'].apply(Chem.rdMolDescriptors.CalcNumAtomStereoCenters)

In [38]:
# # drop the column with RDKit molecules
# df_combined = df_combined.drop(['mol'] , axis=1)
# len(df_combined)

# Descriptors for aromaticity

In [39]:
def calculate_aromatic_atoms(smiles_set):
    """
    Count the aromatic atoms of every molecule in smiles_set.

    :param smiles_set: iterable of SMILES strings
    :return: four dictionaries keyed by SMILES string, holding the number of
        (1) aromatic atoms, (2) aromatic carbons, (3) aromatic nitrogens and
        (4) aromatic heteroatoms (aromatic atoms that are neither carbon nor hydrogen).
        Entries that are not strings or that RDKit cannot parse are left out of
        all four dictionaries, so mapping them onto a column yields missing values.
    """
    aromatic_atoms_dict = dict()
    aromatic_c_dict = dict()
    aromatic_n_dict = dict()
    aromatic_hetero_dict = dict()
    for smiles in tqdm(smiles_set):
        # MolFromSmiles returns None for SMILES it cannot parse;
        # missing values (None / NaN) are not passed to RDKit at all
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            continue

        # walk over the atoms once and keep the atomic numbers of the aromatic ones
        aromatic_atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms() if atom.GetIsAromatic()]
        aromatic_atoms_dict[smiles] = len(aromatic_atomic_nums)
        aromatic_c_dict[smiles] = aromatic_atomic_nums.count(6)
        aromatic_n_dict[smiles] = aromatic_atomic_nums.count(7)
        aromatic_hetero_dict[smiles] = sum(1 for atomic_num in aromatic_atomic_nums if atomic_num not in (1, 6))

    return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict

In [40]:
# smiles_set = set(df_combined["canonical_smiles"])
# aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = calculate_aromatic_atoms(list(smiles_set))

# df_combined['aromatic_atoms'] = df_combined['canonical_smiles'].map(aromatic_atoms_dict)
# df_combined['aromatic_c'] = df_combined['canonical_smiles'].map(aromatic_c_dict)
# df_combined['aromatic_n'] = df_combined['canonical_smiles'].map(aromatic_n_dict)
# df_combined['aromatic_hetero'] = df_combined['canonical_smiles'].map(aromatic_hetero_dict)

In [41]:
# ############### TESTING: aromaticity counts in the old dataset vs. the new dataset ###############
# test = df_combined[['parent_molregno', 'canonical_smiles', 'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']].drop_duplicates()
# test = test.merge(df_comparison[['PARENT_MOLREGNO', 'AROMATIC_ATOMS', 'AROMATIC_C', 'AROMATIC_HETERO']], left_on='parent_molregno', right_on='PARENT_MOLREGNO')
# test = test.drop_duplicates()
# test

In [42]:
# ############### TESTING: aromaticity counts in the old dataset vs. the new dataset ###############
# # there are quite a few instances with differences between the old and the new dataset
# # however, based on validating a few examples, the new counts seem to be more accurate
# test[test['aromatic_atoms'] != test['AROMATIC_ATOMS']]

# ATC level 1

In [43]:
sql = """
SELECT atc.*, matc.*
FROM atc_classification atc
INNER JOIN molecule_atc_classification matc
    ON atc.level5 = matc.level5
"""

# ATC annotations are looked up via df_md_info['salt_molregno'] and reduced to
# one row per (parent compound, ATC level-1 class); l1_full is "<code>_<description>".
atc_levels = (
    pd.read_sql_query(sql, con=engine_ch)
    .merge(df_md_info, how='left', left_on='molregno', right_on='salt_molregno')
    .loc[:, ['parent_molregno', 'level1', 'level1_description']]
    .drop_duplicates()
    .assign(l1_full=lambda atc: atc["level1"] + "_" + atc["level1_description"])
)
atc_levels

Unnamed: 0,parent_molregno,level1,level1_description,l1_full
0,454514,J,ANTIINFECTIVES FOR SYSTEMIC USE,J_ANTIINFECTIVES FOR SYSTEMIC USE
1,675285,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,88739,N,NERVOUS SYSTEM,N_NERVOUS SYSTEM
3,1379143,B,BLOOD AND BLOOD FORMING ORGANS,B_BLOOD AND BLOOD FORMING ORGANS
4,1152067,P,"ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS","P_ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS"
...,...,...,...,...
4326,1376478,N,NERVOUS SYSTEM,N_NERVOUS SYSTEM
4327,366274,S,SENSORY ORGANS,S_SENSORY ORGANS
4328,229629,L,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4329,85649,N,NERVOUS SYSTEM,N_NERVOUS SYSTEM


In [44]:
between_str_join = ' | '
# Collapse all level-1 classes of a parent compound into one sorted,
# ' | '-separated string and keep a single row per parent compound.
atc_levels = (
    atc_levels
    .assign(atc_level1=lambda atc: atc.groupby(['parent_molregno'])['l1_full'].transform(
        lambda l1_names: between_str_join.join(sorted(l1_names))))
    .loc[:, ['parent_molregno', 'atc_level1']]
    .drop_duplicates()
)
atc_levels

Unnamed: 0,parent_molregno,atc_level1
0,454514,J_ANTIINFECTIVES FOR SYSTEMIC USE
1,675285,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
2,88739,N_NERVOUS SYSTEM
3,1379143,B_BLOOD AND BLOOD FORMING ORGANS
4,1152067,"P_ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS"
...,...,...
4326,1376478,N_NERVOUS SYSTEM
4327,366274,S_SENSORY ORGANS
4328,229629,L_ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS
4329,85649,N_NERVOUS SYSTEM


In [45]:
# Left join: compounds without an ATC classification keep a missing atc_level1.
df_combined = pd.merge(df_combined, atc_levels, how='left', on='parent_molregno')

# Add scaffold smiles

In [46]:
# note: this takes a few minutes to calculate for all molecules
def calculate_scaffolds(smiles_set):
    """
    Calculate Murcko scaffold SMILES for every molecule in smiles_set.

    :param smiles_set: iterable of SMILES strings
    :return: two dictionaries keyed by the input SMILES string:
        (1) scaffold SMILES of the molecule as given,
        (2) scaffold SMILES after removing the stereochemistry of the molecule.
        Molecules without rings, entries that are not strings and SMILES that RDKit
        cannot parse are left out of both dictionaries, so mapping them onto a column
        yields missing values.
    """
    scaffolds_dict = dict()
    scaffolds_no_stereo_dict = dict()
    for smiles in tqdm(smiles_set):
        # MolFromSmiles returns None for SMILES it cannot parse;
        # missing values (None / NaN) are not passed to RDKit at all
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            continue
        # no scaffold is recorded for molecules without rings
        if Chem.rdMolDescriptors.CalcNumRings(mol) == 0:
            continue

        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_dict[smiles] = Chem.MolToSmiles(scaffold)

        # repeat after removing stereochemistry
        # (RemoveStereochemistry modifies mol in place, so this has to come second)
        Chem.RemoveStereochemistry(mol)
        scaffold_no_stereo = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_no_stereo_dict[smiles] = Chem.MolToSmiles(scaffold_no_stereo)

    return scaffolds_dict, scaffolds_no_stereo_dict

In [47]:
# smiles_set = set(df_combined["canonical_smiles"])
# scaffolds_dict, scaffolds_no_stereo_dict = calculate_scaffolds(smiles_set)

# df_combined['scaffold_wo_stereo'] = df_combined['canonical_smiles'].map(scaffolds_no_stereo_dict)
# df_combined["scaffold_w_stereo"] = df_combined['canonical_smiles'].map(scaffolds_dict)

# Add target class annotations

In [48]:
sql_pcs = '''
SELECT distinct td.tid, pc.*, pfc.*
FROM target_dictionary td, target_components tc, component_sequences cs, component_class cc, protein_classification pc
INNER JOIN protein_family_classification pfc 
    on  pc.protein_class_id = pfc.protein_class_id
WHERE td.tid = tc.tid
    and tc.component_id = cs.component_id
    and cs.component_id = cc.component_id
    and cc.protein_class_id = pc.protein_class_id
'''

# Protein class annotations (levels l1-l8) per target, restricted to the
# targets (tids) that occur in the current dataset.
current_tids = set(df_combined['tid'])
df_target_classes = (
    pd.read_sql_query(sql_pcs, con=engine_ch)
    .loc[lambda classes: classes['tid'].isin(current_tids)]
)
df_target_classes

Unnamed: 0,tid,protein_class_id,parent_id,pref_name,short_name,protein_class_desc,definition,class_level,protein_class_id.1,protein_class_desc.1,l1,l2,l3,l4,l5,l6,l7,l8
0,1,646,1,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond,2,646,enzyme hydrolase,Enzyme,Hydrolase,,,,,,
1,2,1133,1104,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...,4,1133,transporter ntpase atp binding cassette mrp,Transporter,Primary active transporter,ATP-binding cassette,ABCC subfamily,,,,
2,3,104,1065,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,,4,104,enzyme phosphodiesterase pde_5 pde_5a,Enzyme,Phosphodiesterase,Phosphodiesterase 5,Phosphodiesterase 5A,,,,
3,4,1583,1019,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...,3,1583,ion channel vgc vg ca,Ion channel,Voltage-gated ion channel,Voltage-gated calcium channel,,,,,
5,6,10,1,Oxidoreductase,Reductase,enzyme reductase,The class of all enzymes catalyzing oxidoreduction reactions. The substrate that is oxidized is ...,2,10,enzyme reductase,Enzyme,Oxidoreductase,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8425,117043,404,1693,TKL protein kinase STKR Type 1 subfamily,Type1,enzyme kinase protein kinase tkl stkr type1,,6,404,enzyme kinase protein kinase tkl stkr type1,Enzyme,Kinase,Protein Kinase,TKL protein kinase group,TKL protein kinase STKR family,TKL protein kinase STKR Type 1 subfamily,,
8426,117043,601,0,Unclassified protein,Unclassified,unclassified,,1,601,unclassified,Unclassified protein,,,,,,,
8677,117219,601,0,Unclassified protein,Unclassified,unclassified,,1,601,unclassified,Unclassified protein,,,,,,,
8760,117303,10,1,Oxidoreductase,Reductase,enzyme reductase,The class of all enzymes catalyzing oxidoreduction reactions. The substrate that is oxidized is ...,2,10,enzyme reductase,Enzyme,Oxidoreductase,,,,,,


In [49]:
level = 'l1'
empty_str_replacement = '_'
between_str_join = '|'
desc_column = level + '_desc'
target_classes_level = df_target_classes[['tid', level]].drop_duplicates().dropna()
# remove 'Unclassified protein' from targets with more than one target class, level 1
# (every tid has at least one class here, so a row is kept if it is the only class
# of its target or if it is a real class)
more_than_one = target_classes_level.groupby(['tid'])[level].count()
nof_classes_per_row = target_classes_level['tid'].map(more_than_one)
# .copy() makes the filtered frame independent, so that adding the description
# column below does not trigger a SettingWithCopyWarning
target_classes_level = target_classes_level[
    (nof_classes_per_row == 1)
    | (target_classes_level[level] != 'Unclassified protein')].copy()

# one description per target: spaces replaced by '_', classes sorted and joined by '|'
target_classes_level[desc_column] = target_classes_level.groupby(['tid'])[level].transform(lambda x: between_str_join.join(sorted([y.replace(' ', empty_str_replacement) for y in x])))
target_classes_level = target_classes_level[['tid', desc_column]].drop_duplicates()

# drop a description column left over from an earlier run of this cell, otherwise
# re-running the cell would create suffixed duplicates (l1_desc_x / l1_desc_y)
df_combined = df_combined.drop(columns=[desc_column], errors='ignore').merge(target_classes_level, on='tid', how = 'left')

In [50]:
level = 'l2'
# One level-2 description per target: spaces replaced, classes sorted and joined.
# Separator and space replacement are the values set in the level-1 cell.
target_classes_level = (
    df_target_classes[['tid', level]]
    .drop_duplicates()
    .dropna()
    .assign(l2_desc=lambda classes: classes.groupby(['tid'])[level].transform(
        lambda class_names: between_str_join.join(
            sorted(name.replace(' ', empty_str_replacement) for name in class_names))))
    .loc[:, ['tid', 'l2_desc']]
    .drop_duplicates()
)

# Left join: targets without a level-2 class keep a missing l2_desc.
df_combined = pd.merge(df_combined, target_classes_level, how='left', on='tid')

In [51]:
############### TESTING: which l1 proteins have more than one target family assigned to them? ###############
# Descriptions with several classes contain the '|' separator.
# na=False: targets without a level-1 class (missing l1_desc after the left merge)
# count as "no match" instead of breaking the boolean mask.
test = df_combined.loc[
    df_combined['l1_desc'].str.contains('|', regex=False, na=False),
    ['tid', 'target_pref_name', 'target_type', 'l1_desc', 'l2_desc']].drop_duplicates()
print("#Instances with problems:", len(test))
test

#Instances with problems: 20


Unnamed: 0,tid,target_pref_name,target_type,l1_desc,l2_desc
718,104295,Cyclin-dependent kinase 4/cyclin D1,PROTEIN COMPLEX,Enzyme|Other_cytosolic_protein,Kinase
10383,104811,Bcr/Abl fusion protein,CHIMERIC PROTEIN,Enzyme|Other_cytosolic_protein,Kinase
10463,100128,Breakpoint cluster region protein,SINGLE PROTEIN,Enzyme|Other_cytosolic_protein,Kinase
11602,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion_channel|Membrane_receptor,Family_A_G_protein-coupled_receptor|Ligand-gated_ion_channel
12255,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion_channel|Transporter,Primary_active_transporter|Voltage-gated_ion_channel
12800,104717,Gamma-secretase,PROTEIN COMPLEX,Enzyme|Ion_channel,Other_ion_channel|Protease
13939,106197,26S proteasome,PROTEIN COMPLEX,Enzyme|Other_cytosolic_protein,Protease
15600,104758,Potassium-transporting ATPase,PROTEIN COMPLEX,Enzyme|Transporter,Hydrolase|Primary_active_transporter
15986,104782,"Sulfonylurea receptor 2, Kir6.2",PROTEIN COMPLEX,Ion_channel|Transporter,Primary_active_transporter|Voltage-gated_ion_channel
18079,105734,Voltage-gated calcium channel,PROTEIN COMPLEX GROUP,Auxiliary_transport_protein|Ion_channel,Calcium_channel_auxiliary_subunit_alpha2delta_family|Calcium_channel_auxiliary_subunit_beta_fami...


In [52]:
############### TESTING: which l2 proteins have more than one target family assigned to them? ###############
# Rows without a level-2 description are excluded first, then the '|' separator
# marks targets with several level-2 classes.
df_combined_test = df_combined[df_combined['l2_desc'].notnull()]
test = df_combined_test.loc[
    df_combined_test['l2_desc'].str.contains('|', regex=False),
    ['tid', 'target_pref_name', 'target_type', 'l1_desc', 'l2_desc']].drop_duplicates()
print("#Instances with problems:", len(test))
test

#Instances with problems: 10


Unnamed: 0,tid,target_pref_name,target_type,l1_desc,l2_desc
11602,104841,Serotonin (5-HT) receptor,PROTEIN FAMILY,Ion_channel|Membrane_receptor,Family_A_G_protein-coupled_receptor|Ligand-gated_ion_channel
12255,104737,Sulfonylurea receptors; K-ATP channels,PROTEIN COMPLEX GROUP,Ion_channel|Transporter,Primary_active_transporter|Voltage-gated_ion_channel
12800,104717,Gamma-secretase,PROTEIN COMPLEX,Enzyme|Ion_channel,Other_ion_channel|Protease
15600,104758,Potassium-transporting ATPase,PROTEIN COMPLEX,Enzyme|Transporter,Hydrolase|Primary_active_transporter
15986,104782,"Sulfonylurea receptor 2, Kir6.2",PROTEIN COMPLEX,Ion_channel|Transporter,Primary_active_transporter|Voltage-gated_ion_channel
18079,105734,Voltage-gated calcium channel,PROTEIN COMPLEX GROUP,Auxiliary_transport_protein|Ion_channel,Calcium_channel_auxiliary_subunit_alpha2delta_family|Calcium_channel_auxiliary_subunit_beta_fami...
18294,104770,Sodium/potassium-transporting ATPase,PROTEIN COMPLEX GROUP,Enzyme|Ion_channel|Transporter,Hydrolase|Other_ion_channel|Primary_active_transporter
19073,29,Sodium/potassium-transporting ATPase alpha-1 chain,SINGLE PROTEIN,Enzyme|Transporter,Hydrolase|Primary_active_transporter
35115,104852,"Sulfonylurea receptor 1, Kir6.2",PROTEIN COMPLEX,Ion_channel|Transporter,Primary_active_transporter|Voltage-gated_ion_channel
651570,322,DNA (cytosine-5)-methyltransferase 3A,SINGLE PROTEIN,Epigenetic_regulator,Reader|Writer


In [53]:
############### TESTING: are all Transcription_factors also Nuclear_receptors? -> yes ###############
# Unique (target, level-1, level-2) combinations of all transcription factor targets.
df_combined.loc[df_combined['l1_desc'].eq('Transcription_factor'), ['tid', 'l1_desc', 'l2_desc']].drop_duplicates()

Unnamed: 0,tid,l1_desc,l2_desc
63,275,Transcription_factor,Nuclear_receptor
64,277,Transcription_factor,Nuclear_receptor
65,266,Transcription_factor,Nuclear_receptor
66,57,Transcription_factor,Nuclear_receptor
67,270,Transcription_factor,Nuclear_receptor
68,133,Transcription_factor,Nuclear_receptor
207,271,Transcription_factor,Nuclear_receptor
244,36,Transcription_factor,Nuclear_receptor
245,56,Transcription_factor,Nuclear_receptor
246,25,Transcription_factor,Nuclear_receptor


In [54]:
############### TESTING: which l1 descriptions are there? ###############
# Distinct level-1 descriptions that occur in the combined dataset.
set(df_combined['l1_desc'].tolist())

{'Adhesion',
 'Adhesion|Membrane_receptor',
 'Adhesion|Membrane_receptor|Surface_antigen',
 'Auxiliary_transport_protein',
 'Auxiliary_transport_protein|Ion_channel',
 'Enzyme',
 'Enzyme|Ion_channel',
 'Enzyme|Ion_channel|Transporter',
 'Enzyme|Other_cytosolic_protein',
 'Enzyme|Transporter',
 'Epigenetic_regulator',
 'Ion_channel',
 'Ion_channel|Membrane_receptor',
 'Ion_channel|Transporter',
 'Membrane_receptor',
 'Membrane_receptor|Other_membrane_protein',
 'Membrane_receptor|Secreted_protein',
 'Membrane_receptor|Surface_antigen',
 'Other_cytosolic_protein',
 'Other_membrane_protein',
 'Secreted_protein',
 'Structural_protein',
 'Surface_antigen',
 'Transcription_factor',
 'Transporter',
 'Unclassified_protein'}

In [55]:
############### TESTING: which l2 descriptions are there? ###############
# Distinct level-2 descriptions that occur in the combined dataset.
set(df_combined['l2_desc'].tolist())

{'Calcium_channel_auxiliary_subunit_alpha2delta_family',
 'Calcium_channel_auxiliary_subunit_alpha2delta_family|Calcium_channel_auxiliary_subunit_beta_family|Calcium_channel_auxiliary_subunit_gamma_family|Voltage-gated_ion_channel',
 'Cytochrome_P450',
 'Electrochemical_transporter',
 'Eraser',
 'Family_A_G_protein-coupled_receptor',
 'Family_A_G_protein-coupled_receptor|Ligand-gated_ion_channel',
 'Family_B_G_protein-coupled_receptor',
 'Family_C_G_protein-coupled_receptor',
 'Frizzled_family_G_protein-coupled_receptor',
 'Group_translocator',
 'Hydrolase',
 'Hydrolase|Other_ion_channel|Primary_active_transporter',
 'Hydrolase|Primary_active_transporter',
 'Isomerase',
 'Kinase',
 'Ligand-gated_ion_channel',
 'Ligase',
 'Lyase',
 'Nuclear_receptor',
 'Other_ion_channel',
 'Other_ion_channel|Protease',
 'Oxidoreductase',
 'Phosphatase',
 'Phosphodiesterase',
 'Primary_active_transporter',
 'Primary_active_transporter|Voltage-gated_ion_channel',
 'Protease',
 'Reader|Writer',
 'Toll-lik

# Filter for targets (all assay types) with at least 100 compounds per target

## At least 100 compounds per target

In [56]:
# All assay types: keep the rows that are not restricted to binding assays.
df_combined_all_assays = df_combined.loc[df_combined['only_binding'].eq(False)]

In [57]:
min_nof_cpds = 100

# Per target (tid_mutation): number of rows that have a mean pchembl value.
comparator_counts = (
    df_combined_all_assays
    .dropna(subset=['pchembl_value_mean'])
    .groupby('tid_mutation')['parent_molregno']
    .count()
)
# Targets that reach the minimum count, and all of their rows.
targets_w_enough_cpds = comparator_counts.loc[comparator_counts.ge(min_nof_cpds)].index.tolist()
df_combined_all_assays_100 = df_combined_all_assays[df_combined_all_assays['tid_mutation'].isin(targets_w_enough_cpds)]

In [58]:
# df_combined_all_assays.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.csv", sep = ";", index = False)
# df_combined_all_assays_100.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100.csv", sep = ";", index = False)

# df_combined_all_assays.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.xlsx", index = False)
# df_combined_all_assays_100.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100.xlsx", index = False)

## Only drug targets 

In [59]:
# Targets with at least one row annotated as D_DT, and all rows of these targets.
d_dt_targets = set(
    df_combined_all_assays_100.loc[df_combined_all_assays_100['DTI'].eq('D_DT'), 'tid_mutation'].tolist())
df_d_dt_targets = df_combined_all_assays_100[df_combined_all_assays_100['tid_mutation'].isin(d_dt_targets)]

In [60]:
# df_d_dt_targets.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100_d_dt.csv", sep = ";", index = False)
# df_d_dt_targets.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100_d_dt.xlsx", index = False)

In [61]:
############### TESTING: all assay types ###############
# Record the sizes of the three all-assay datasets (unfiltered, >= 100 compounds per target,
# >= 100 compounds and D_DT targets only) under the labels given as second argument.
# NOTE(review): add_dataset_sizes is defined earlier in the notebook (not visible here); presumably it
# appends one row per call to the size-tracking lists shown in the overview tables at the end,
# which would make the call order the row order - confirm.
add_dataset_sizes(df_combined_all_assays, "all assays")
add_dataset_sizes(df_combined_all_assays_100, "all, >= 100")
add_dataset_sizes(df_d_dt_targets, "all, >= 100, d_dt")

In [62]:
############### TESTING ###############
# Number of targets that pass the compound threshold, overall and with a D_DT annotation.
print("{:53} {}".format("Targets with >= 100 comparators:", len(targets_w_enough_cpds)))
print("{:53} {}\n".format("Targets with >= 100 comparators and d_dt assignment:", len(d_dt_targets)))

# Number of distinct targets that have rows with max_phase 1, 2, 3 and 4.
for max_phase in (1, 2, 3, 4):
    print("Phase: ", max_phase)
    df_phase = df_combined_all_assays_100.loc[df_combined_all_assays_100['max_phase'].eq(max_phase)]
    df_d_dt_targets_with_phase = df_d_dt_targets.loc[df_d_dt_targets['max_phase'].eq(max_phase)]
    print("{:50} {}: {}".format(
        "#Targets with annotated cpds in max_phase", max_phase, df_phase['tid_mutation'].nunique()))
    print("{:50} {}: {}\n".format(
        "#D_DT targets with annotated cpds in max_phase", max_phase, df_d_dt_targets_with_phase['tid_mutation'].nunique()))

Targets with >= 100 comparators:                      567
Targets with >= 100 comparators and d_dt assignment:  309

Phase:  1
#Targets with annotated cpds in max_phase          1: 329
#D_DT targets with annotated cpds in max_phase     1: 189

Phase:  2
#Targets with annotated cpds in max_phase          2: 437
#D_DT targets with annotated cpds in max_phase     2: 258

Phase:  3
#Targets with annotated cpds in max_phase          3: 430
#D_DT targets with annotated cpds in max_phase     3: 254

Phase:  4
#Targets with annotated cpds in max_phase          4: 502
#D_DT targets with annotated cpds in max_phase     4: 309



# Filter for targets (only binding assays) with at least 100 comparator compounds

## At least 100 compounds per target

In [63]:
# Binding assays only: keep the rows that are restricted to binding assays.
# (The restriction to drug targets is applied further below.)
df_comb_B = df_combined.loc[df_combined['only_binding'].eq(True)]

In [64]:
min_nof_cpds = 100

# Per target (tid_mutation): number of binding rows that have a mean pchembl value.
comparator_counts_B = (
    df_comb_B
    .dropna(subset=['pchembl_value_mean'])
    .groupby('tid_mutation')['parent_molregno']
    .count()
)
# Targets that reach the minimum count, and all of their rows.
targets_w_enough_cpds_B = comparator_counts_B.loc[comparator_counts_B.ge(min_nof_cpds)].index.tolist()
df_comb_B_100 = df_comb_B[df_comb_B['tid_mutation'].isin(targets_w_enough_cpds_B)]

In [65]:
# df_comb_B.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays.csv", sep = ";", index = False)
# df_comb_B_100.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100.csv", sep = ";", index = False)

# df_comb_B.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays.xlsx", index = False)
# df_comb_B_100.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100.xlsx", index = False)

## Only drug targets 

In [66]:
# Targets with at least one row annotated as D_DT, and all rows of these targets.
d_dt_targets_B = set(
    df_comb_B_100.loc[df_comb_B_100['DTI'].eq('D_DT'), 'tid_mutation'].tolist())
df_d_dt_targets_B = df_comb_B_100[df_comb_B_100['tid_mutation'].isin(d_dt_targets_B)]

In [67]:
# df_d_dt_targets_B.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100_d_dt.csv", sep = ";", index = False)
# df_d_dt_targets_B.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100_d_dt.xlsx", index = False)

In [68]:
############### TESTING: binding assays ###############
# Record the sizes of the three binding-assay datasets (unfiltered, >= 100 compounds per target,
# >= 100 compounds and D_DT targets only) under the labels given as second argument.
# NOTE(review): add_dataset_sizes is defined earlier in the notebook (not visible here); presumably it
# appends one row per call to the size-tracking lists shown in the overview tables at the end,
# which would make the call order the row order - confirm.
add_dataset_sizes(df_comb_B, "binding")
add_dataset_sizes(df_comb_B_100, "b, >= 100")
add_dataset_sizes(df_d_dt_targets_B, "b, >= 100, d_dt")

In [69]:
############### TESTING ###############
# Number of targets that pass the compound threshold, overall and with a D_DT annotation.
print("{:53} {}".format("Targets with >= 100 comparators:", len(targets_w_enough_cpds_B)))
print("{:53} {}\n".format("Targets with >= 100 comparators and d_dt assignment:", len(d_dt_targets_B)))

# Number of distinct targets that have rows with max_phase 1, 2, 3 and 4.
for max_phase in (1, 2, 3, 4):
    print("Phase: ", max_phase)
    df_phase = df_comb_B_100.loc[df_comb_B_100['max_phase'].eq(max_phase)]
    df_d_dt_targets_with_phase = df_d_dt_targets_B.loc[df_d_dt_targets_B['max_phase'].eq(max_phase)]
    print("{:50} {}: {}".format(
        "#Targets with annotated cpds in max_phase", max_phase, df_phase['tid_mutation'].nunique()))
    print("{:50} {}: {}\n".format(
        "#D_DT targets with annotated cpds in max_phase", max_phase, df_d_dt_targets_with_phase['tid_mutation'].nunique()))

Targets with >= 100 comparators:                      539
Targets with >= 100 comparators and d_dt assignment:  296

Phase:  1
#Targets with annotated cpds in max_phase          1: 311
#D_DT targets with annotated cpds in max_phase     1: 177

Phase:  2
#Targets with annotated cpds in max_phase          2: 415
#D_DT targets with annotated cpds in max_phase     2: 247

Phase:  3
#Targets with annotated cpds in max_phase          3: 402
#D_DT targets with annotated cpds in max_phase     3: 240

Phase:  4
#Targets with annotated cpds in max_phase          4: 471
#D_DT targets with annotated cpds in max_phase     4: 296



# Overview of comparisons to old ChEMBL26 dataset

In [70]:
############### TESTING: development of size(curr_data) ###############
# One row per recorded curation step, in the order in which the sizes were recorded.
print("Size(curr_data)")
pd.DataFrame(
    data=all_length_comp,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

Size(curr_data)


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,init,1098855,1754,8312,3360,2579716,29759
1,pre DTI,921555,752,7410,588,2085879,1628
2,post DTI,477626,752,1472,588,723010,1628
3,cpd props,477185,750,1472,588,722308,1626
4,all assays,477185,750,1472,588,722308,1626
5,"all, >= 100",470770,687,567,309,707355,1226
6,"all, >= 100, d_dt",292793,687,309,309,439024,1226
7,binding,384806,716,1424,573,579084,1559
8,"b, >= 100",377893,656,539,296,563611,1166
9,"b, >= 100, d_dt",245278,656,296,296,371542,1166


In [71]:
############### TESTING: development of size(curr_data) with pchembl data ###############
# One row per recorded curation step, in the order in which the sizes were recorded.
print("Size(curr_data) with pchembl data")
pd.DataFrame(
    data=all_length_comp_d_dt_pchembl,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

Size(curr_data) with pchembl data


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,init,923862,1637,6893,2569,2094029,21601
1,pre DTI,921348,696,6843,484,2074104,1420
2,post DTI,477417,696,1368,484,718807,1420
3,cpd props,476976,694,1368,484,718105,1418
4,all assays,476976,694,1368,484,718105,1418
5,"all, >= 100",470582,653,567,296,704036,1155
6,"all, >= 100, d_dt",292577,653,309,296,437090,1155
7,binding,384577,660,1327,471,575043,1362
8,"b, >= 100",377682,617,539,283,560469,1098
9,"b, >= 100, d_dt",245071,617,296,283,369782,1098
