# Notebook to extract and curate ChEMBL data for the Leeson data set (drug-target interactions)

### Authors: Barbara Zdrazil, Lina Heinzke
### 10/2022

**This notebook extracts data from ChEMBL and performs some curation steps in order to retrieve a data set for drug-target, and clinical candidate-target associations including comparator compounds.**

**The notebook is based on initial work by Anne Hersey, Patricia Bento, Emma Manners, Paul Leeson, and Andrew Leach.**

**More documentation on the initial data set compilation can be found here ("Ligand Efficiency"): https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?spaceKey=CHEMBL&title=Anne%27s+Notes**


In [1]:
import pandas as pd
import numpy as np
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools

#### notebook settings
# Raise pandas' display limits so wide/long query results can be inspected inline.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 100)

# Get data from ChEMBL

In [2]:
# # @Barbara: uncomment and modify this to your preferred paths
# path_results = "/Users/bzdrazil/Dropbox/ChEMBL/NP/data/"
# path_sqlite3_database = <your sqlite database location>

# ChEMBL release number; it is part of the database directory and file names below.
chembl_version = "26"
base_path = "/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/"
# Output directory for result files.
path_results = f"{base_path}results/"
# Location of the SQLite dump of the selected ChEMBL release.
path_sqlite3_database = (
    f"{base_path}data/chembl_{chembl_version}"
    f"/chembl_{chembl_version}_sqlite/chembl_{chembl_version}.db"
)

In [3]:
# # @Barbara: Accessing ChEMBL using Oracle
# # If you want to use this option, you have to change the sql statements to the commented line:
# # from docs -> from chembl_31.docs

# import cx_Oracle

# #cx_Oracle.init_oracle_client(lib_dir="/Users/bzdrazil/Downloads/instantclient_19_8")  #https://www.oracle.com/in/database/technologies/instant-client/macos-intel-x86-downloads.html; https://stackoverflow.com/questions/56119490/cx-oracle-error-dpi-1047-cannot-locate-a-64-bit-oracle-client-library
# cx_Oracle.clientversion() 

# import pandas as pd
# import sqlalchemy as sa
# import requests

# workdir = '/Users/bzdrazil/Desktop/'

# chemdev2 = 'oracle://{}:{}@ora-dlvm-103.ebi.ac.uk:1521/?service_name=chemdev2'.format('user', 'pw') # insert your username and password
# engine_ch = sa.create_engine(chemdev2)

In [4]:
import sqlite3

# Open a connection to the SQLite file configured in path_sqlite3_database.
# The connection object is handed to pandas as `con` for the SQL queries in this notebook.
engine_ch = sqlite3.connect(path_sqlite3_database)

In [5]:
sql = '''
SELECT act.molregno, act.pchembl_value, act.standard_type, 
    ass.assay_type, ass.tid, 
    vs.mutation,
    td.pref_name as target_pref_name, td.target_type, td.organism, td.chembl_id as target_chembl_id,
    docs.year, docs.journal
FROM activities act
INNER JOIN assays ass 
    on  act.assay_id = ass.assay_id
LEFT JOIN variant_sequences vs
    on ass.variant_id = vs.variant_id
INNER JOIN target_dictionary td
    on ass.tid = td.tid
INNER JOIN docs
    on act.doc_id = docs.doc_id
WHERE act.pchembl_value is not null
    and act.potential_duplicate = 0
    and act.standard_relation = '='
    and data_validity_comment is null
    and td.tid <>22226   ----exclude unchecked targets
    and td.target_type like '%PROTEIN%'
    -- and ass.assay_type = 'B' -- only binding assays, will be taken care of later
'''

# One row per activity measurement, annotated with assay, target and document info.
df_mols = pd.read_sql_query(sql=sql, con=engine_ch)
# Target key that also separates variants: plain "<tid>" when no mutation is
# annotated, "<tid>-<mutation>" otherwise.
df_mols['tid_mutation'] = np.where(
    df_mols['mutation'].isnull(),
    df_mols['tid'].astype('str'),
    df_mols['tid'].astype('str') + '-' + df_mols['mutation'],
)
df_mols

Unnamed: 0,molregno,pchembl_value,standard_type,assay_type,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,year,journal,tid_mutation
0,252199,5.40,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
1,253534,4.77,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
2,253199,6.75,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483
3,253199,5.22,IC50,A,12594,,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,2004.0,Bioorg. Med. Chem. Lett.,12594
4,253199,4.43,IC50,A,17045,,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,2004.0,Bioorg. Med. Chem. Lett.,17045
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475921,2317531,8.10,Ki,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522
2475922,2334349,9.22,Kd,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134
2475923,2329285,8.42,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134
2475924,2317951,7.89,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134


In [6]:
# both salt and parent needed: md.chembl_id as compound_chembl_id, md.pref_name as compound_pref_name
# extremes needed md.max_phase, md.usan_year, md.first_approval
# salt info might be sufficient: md.prodrug, md.oral, md.parenteral, md.topical, md.black_box_warning

q = """
    SELECT DISTINCT md2.molregno as parent_molregno, md2.chembl_id as parent_chemblid, md2.pref_name as parent_pref_name
    , md.molregno as salt_molregno, md.chembl_id as salt_chemblid, md.pref_name as salt_pref_name
    , md.max_phase as salt_max_phase, md.usan_year as salt_usan_year, md.first_approval as salt_first_approval
    , md.prodrug, md.oral, md.parenteral, md.topical, md.black_box_warning

    --First join parent_cmpds and salt_cmpds (ie their children)
    FROM molecule_dictionary md                                        --salt_molregno
    JOIN molecule_hierarchy mh ON mh.molregno = md.molregno
    JOIN molecule_dictionary md2 ON mh.parent_molregno = md2.molregno  --parent_molregno
    
    -- Now set relevant conditions: 
    -- WHERE md.max_phase = '4'
    --AND md2.chembl_id = 'CHEMBL192' --Test using Sildenafil Citrate 

    """
# Summarise the salt-level values per parent compound (highest max_phase, earliest
# USAN year, earliest first approval), then discard the salt-level source columns.
df_md_info = (
    pd.read_sql_query(q, engine_ch)
    .pipe(lambda md: md.assign(
        max_phase=md.groupby(['parent_molregno'])['salt_max_phase'].transform('max'),
        usan_year=md.groupby(['parent_molregno'])['salt_usan_year'].transform('min'),
        first_approval=md.groupby(['parent_molregno'])['salt_first_approval'].transform('min'),
    ))
    .drop(columns=['salt_max_phase', 'salt_usan_year', 'salt_first_approval'])
)

In [7]:
# Attach the parent/salt annotations to every activity row; the molregno of the
# activity is matched against the salt-level molregno. Rows without a match are kept.
df_mols = pd.merge(df_mols, df_md_info, how='left', left_on='molregno', right_on='salt_molregno')
df_mols

Unnamed: 0,molregno,pchembl_value,standard_type,assay_type,tid,mutation,target_pref_name,target_type,organism,target_chembl_id,year,journal,tid_mutation,parent_molregno,parent_chemblid,parent_pref_name,salt_molregno,salt_chemblid,salt_pref_name,prodrug,oral,parenteral,topical,black_box_warning,max_phase,usan_year,first_approval
0,252199,5.40,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,252199,CHEMBL357278,,252199,CHEMBL357278,,-1,0,0,0,0,0,,
1,253534,4.77,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253534,CHEMBL357119,,253534,CHEMBL357119,,-1,0,0,0,0,0,,
2,253199,6.75,IC50,B,10483,,Palmitoyl-CoA oxidase,SINGLE PROTEIN,Rattus norvegicus,CHEMBL4632,2004.0,Bioorg. Med. Chem. Lett.,10483,253199,CHEMBL152968,,253199,CHEMBL152968,,-1,0,0,0,0,0,,
3,253199,5.22,IC50,A,12594,,Cytochrome P450 1A2,SINGLE PROTEIN,Homo sapiens,CHEMBL3356,2004.0,Bioorg. Med. Chem. Lett.,12594,253199,CHEMBL152968,,253199,CHEMBL152968,,-1,0,0,0,0,0,,
4,253199,4.43,IC50,A,17045,,Cytochrome P450 3A4,SINGLE PROTEIN,Homo sapiens,CHEMBL340,2004.0,Bioorg. Med. Chem. Lett.,17045,253199,CHEMBL152968,,253199,CHEMBL152968,,-1,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475921,2317531,8.10,Ki,B,11522,,Cholecystokinin B receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL298,2018.0,J Med Chem,11522,2317531,CHEMBL4278080,,2317531,CHEMBL4278080,,-1,0,0,0,0,0,,
2475922,2334349,9.22,Kd,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134,2334349,CHEMBL4294901,,2334349,CHEMBL4294901,,-1,0,0,0,0,0,,
2475923,2329285,8.42,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134,2329285,CHEMBL4289837,,2329285,CHEMBL4289837,,-1,0,0,0,0,0,,
2475924,2317951,7.89,Ki,B,134,,Vasopressin V1a receptor,SINGLE PROTEIN,Homo sapiens,CHEMBL1889,2018.0,J Med Chem,134,2317951,CHEMBL4278500,,2317951,CHEMBL4278500,,-1,0,0,0,0,0,,


In [8]:
# Use pandas' nullable integer dtype so the year columns can hold missing values
# without being stored as floats.
df_mols = df_mols.astype(dict.fromkeys(['year', 'first_approval'], 'Int64'))

In [9]:
# df_mols.to_csv(path_results+"ChEMBL"+chembl_version+"_initial_query.csv", sep = ';', index = False)

In [10]:
############### TESTING: load comparison data set ###############
def get_sets(data):
    """Collect the unique molecules, targets and pairs of a comparison table.

    `data` needs the columns PARENT_MOLREGNO, TID_mutation, PAIRS and DTI.
    Rows whose DTI equals 'D_DT' form the drug subset.

    Returns a 6-tuple of sets in this order:
    molecules, drug molecules, targets, drug targets, pairs, drug pairs.
    """
    drug_rows = data[data['DTI'] == 'D_DT']
    collected = []
    for column in ("PARENT_MOLREGNO", "TID_mutation", "PAIRS"):
        # all rows first, then the D_DT subset, matching the documented order
        collected.append(set(data[column]))
        collected.append(set(drug_rows[column]))
    return tuple(collected)

# full comparison dataset
df_comparison = pd.read_csv(base_path+"data/DTI_2021_final.dsv", sep = "\t")
# map PARENT_MOLREGNO based on salt info table used in the current pipeline:
# the old PARENT_MOLREGNO is treated as a salt-level id and looked up in df_md_info
df_comparison = df_comparison.merge(df_md_info, left_on = 'PARENT_MOLREGNO', right_on = 'salt_molregno', how = 'left')
# keep the original id as *_OLD and promote the looked-up parent to PARENT_MOLREGNO
df_comparison = df_comparison.rename(
    columns={"PARENT_MOLREGNO": "PARENT_MOLREGNO_OLD", "parent_molregno": "PARENT_MOLREGNO"},
    errors="raise")
df_comparison = df_comparison.astype({'PARENT_MOLREGNO': 'Int64'})
# add parent_molregno for cases where there is no info in df_md_info
df_comparison.loc[(df_comparison['PARENT_MOLREGNO'].isnull()), 'PARENT_MOLREGNO'] = df_comparison['PARENT_MOLREGNO_OLD']

# Split "<tid>-<ending>" at the first '-' only.
# `n` is passed by keyword: positional use was deprecated in pandas 1.x and
# raises a TypeError from pandas 2.0 on.
df_comparison[['TID_no_ending', 'ending']] = df_comparison['TID'].str.split('-', n=1, expand=True)
# An ending of 'WT' (or no ending at all) maps to the bare tid; any other ending is kept.
df_comparison['TID_mutation'] = np.where(((df_comparison['ending'].notnull()) & (df_comparison['ending'] != 'WT')), 
                                   df_comparison['TID_no_ending']+'-'+df_comparison['ending'], 
                                   df_comparison['TID_no_ending'])
df_comparison['PAIRS'] = df_comparison.agg('{0[PARENT_MOLREGNO]}_{0[TID_mutation]}'.format, axis=1)
df_comparison = df_comparison.astype({'TID_no_ending': 'Int64'})

comp_mols, comp_drugs, comp_targets, comp_drug_targets, comp_pairs, comp_drug_pairs = get_sets(df_comparison)

# Row layout of every summary list:
# [label, #mols, #drugs, #targets, #drug targets, #pairs, #drug pairs]
comparison_list = ["comparison", *map(len, (comp_mols, comp_drugs,
                                            comp_targets, comp_drug_targets,
                                            comp_pairs, comp_drug_pairs))]
all_intersect_comp = [comparison_list]
all_length_comp = [comparison_list]

# limit the dataset to the targets that have at least 100 compounds (including drugs)
min_nof_cpds = 100
comparator_cpds_100 = df_comparison.groupby(['TID'])['PARENT_MOLREGNO'].count()
targets_enough_cpds = comparator_cpds_100[comparator_cpds_100 >= min_nof_cpds].index.tolist()
df_comparison_100 = df_comparison[df_comparison['TID'].isin(targets_enough_cpds)]

(comp_mols_100, comp_drugs_100,
 comp_targets_100, comp_drug_targets_100,
 comp_pairs_100, comp_drug_pairs_100) = get_sets(df_comparison_100)

comparison_list_100 = ["comparison_100", *map(len, (comp_mols_100, comp_drugs_100,
                                                    comp_targets_100, comp_drug_targets_100,
                                                    comp_pairs_100, comp_drug_pairs_100))]
all_intersect_comp_100 = [comparison_list_100]
all_length_comp.append(comparison_list_100)


# further restrict to targets that have at least one D_DT row
comp_d_dt_targets = set(df_comparison_100.loc[df_comparison_100['DTI'] == 'D_DT', 'TID'])
comp_d_dt = df_comparison_100[df_comparison_100['TID'].isin(comp_d_dt_targets)]
(comp_mols_d_dt, comp_drugs_d_dt,
 comp_targets_d_dt, comp_drug_targets_d_dt,
 comp_pairs_d_dt, comp_drug_pairs_d_dt) = get_sets(comp_d_dt)
comparison_list_d_dt = ["comparison_d_dt", *map(len, (comp_mols_d_dt, comp_drugs_d_dt,
                                                      comp_targets_d_dt, comp_drug_targets_d_dt,
                                                      comp_pairs_d_dt, comp_drug_pairs_d_dt))]
all_intersect_comp_d_dt = [comparison_list_d_dt]
all_length_comp.append(comparison_list_d_dt)

In [11]:
############### TESTING: method to print comparison to original dataset ###############
def print_comparison(description, now, comp, output=False):
    """Return the size of the intersection of `now` and `comp`.

    `now` is the current set and `comp` the comparison set. When `output`
    is true, a small report (description plus the sizes of both sets and
    of their intersection) is printed as well.
    """
    shared = now.intersection(comp)
    if not output:
        return len(shared)

    print("------")
    print(description)
    for label, members in (("current:", now), ("comparison:", comp), ("intersection:", shared)):
        print("{:15}".format(label), len(members))
    return len(shared)


def print_all_comparisons(now_mols, now_drugs, now_targets, now_drug_targets, now_pairs, now_drug_pairs, 
                          output=False, comp_type="base"):
    """Compare six current sets against one of the module-level comparison set families.

    Parameters
    ----------
    now_mols, now_drugs, now_targets, now_drug_targets, now_pairs, now_drug_pairs : set
        Sets describing the current state of the dataset.
    output : bool
        Forwarded to print_comparison; if true, each comparison is printed.
    comp_type : str
        Which comparison sets to use: "base" (comp_*), "only_100" (comp_*_100)
        or "d_dt" (comp_*_d_dt). These sets are read from module scope.

    Returns
    -------
    list of int
        Intersection sizes in the order mols, drugs, targets, drug targets,
        pairs, drug pairs.

    Raises
    ------
    ValueError
        If comp_type is not one of the supported values.
    """
    descriptions = ["Mols (molregno)", "Drugs (molregno)", "Targets (tid)", "Drug targets (tid)", "Compound-target pairs",  "Drug-target pairs"]
    now_results = [now_mols, now_drugs, now_targets, now_drug_targets, now_pairs, now_drug_pairs]
    # The branches stay lazy so that only the requested comparison sets need to exist.
    if comp_type == "base":
        comparison_results = [comp_mols, comp_drugs, comp_targets, comp_drug_targets, comp_pairs, comp_drug_pairs]
    elif comp_type == "only_100":
        comparison_results = [comp_mols_100, comp_drugs_100, comp_targets_100, comp_drug_targets_100, comp_pairs_100, comp_drug_pairs_100]
    elif comp_type == "d_dt":
        comparison_results = [comp_mols_d_dt, comp_drugs_d_dt, comp_targets_d_dt, comp_drug_targets_d_dt, comp_pairs_d_dt, comp_drug_pairs_d_dt]
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown comp_type {!r}; expected 'base', 'only_100' or 'd_dt'".format(comp_type))

    return [print_comparison(description, now, comp, output)
            for description, now, comp in zip(descriptions, now_results, comparison_results)]


def add_intersections(data, label, output=False):
    """Record how the current dataset overlaps with the comparison datasets.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns parent_molregno, tid_mutation and max_phase.
        Rows with max_phase == 4 are counted as drugs.
    label : str
        Name of the pipeline stage; stored as first entry of every appended row.
    output : bool
        If true, the individual comparisons are printed.

    Side effects
    ------------
    Appends one row each to the module-level lists all_intersect_comp,
    all_length_comp, all_intersect_comp_100 and all_intersect_comp_d_dt.
    Returns None.
    """
    is_drug = data["max_phase"] == 4
    # Build the "<parent_molregno>_<tid_mutation>" keys column-wise. This yields the
    # same strings as formatting each row, but avoids a Python-level call per row and
    # also gives an empty result (rather than the column names) for an empty subset.
    pair_keys = data["parent_molregno"].astype(str) + "_" + data["tid_mutation"].astype(str)

    # order: mols, drugs, targets, drug targets, pairs, drug pairs
    now_sets = (
        set(data["parent_molregno"]),
        set(data.loc[is_drug, "parent_molregno"]),
        set(data["tid_mutation"]),
        set(data.loc[is_drug, "tid_mutation"]),
        set(pair_keys),
        set(pair_keys[is_drug]),
    )

    comp = print_all_comparisons(*now_sets, output, comp_type="base")
    all_intersect_comp.append([label] + comp)
    all_length_comp.append([label, *map(len, now_sets)])

    if output:
        print("..................... >= 100 cpds .....................")
    comp_100 = print_all_comparisons(*now_sets, output, comp_type="only_100")
    all_intersect_comp_100.append([label] + comp_100)
    
    if output:
        print("..................... d_dt cpds .....................")
    comp_d_dt = print_all_comparisons(*now_sets, output, comp_type="d_dt")
    all_intersect_comp_d_dt.append([label] + comp_d_dt)

In [12]:
############### TESTING: initial query ###############
# Compare the unfiltered query result with the comparison datasets and print the details.
add_intersections(df_mols, "init", output=True)

------
Mols (molregno)
current:        923862
comparison:     378661
intersection:   378661
------
Drugs (molregno)
current:        1637
comparison:     686
intersection:   616
------
Targets (tid)
current:        6893
comparison:     910
intersection:   907
------
Drug targets (tid)
current:        2569
comparison:     351
intersection:   349
------
Compound-target pairs
current:        2094029
comparison:     561772
intersection:   560759
------
Drug-target pairs
current:        21601
comparison:     1221
intersection:   1120
..................... >= 100 cpds .....................
------
Mols (molregno)
current:        923862
comparison:     372452
intersection:   372452
------
Drugs (molregno)
current:        1637
comparison:     643
intersection:   573
------
Targets (tid)
current:        6893
comparison:     525
intersection:   524
------
Drug targets (tid)
current:        2569
comparison:     271
intersection:   269
------
Compound-target pairs
current:        2094029
comparison:

# Calculate mean, median, and max pChEMBL values for each compound-target pair

In [13]:
# summarise pchembl values into mean, max, median and year into first publication
# All four summaries use the same (compound, target variant) grouping, so group once.
_pair_groups = df_mols.groupby(['parent_molregno', 'tid_mutation'])
df_mols['pchembl_value_mean'] = _pair_groups['pchembl_value'].transform('mean')
df_mols['pchembl_value_max'] = _pair_groups['pchembl_value'].transform('max')
df_mols['pchembl_value_median'] = _pair_groups['pchembl_value'].transform('median')
df_mols['first_publication_target_cmpd_pair'] = _pair_groups['year'].transform('min')

In [14]:
# repeat based on the data of only the binding assays
df_mols_binding = df_mols.loc[df_mols['assay_type'] == 'B'].copy()
# Recompute the per-pair summaries on the binding-only rows; the summary columns
# are (re)assigned here from the binding subset alone.
_binding_pair_groups = df_mols_binding.groupby(['parent_molregno', 'tid_mutation'])
df_mols_binding['pchembl_value_mean'] = _binding_pair_groups['pchembl_value'].transform('mean')
df_mols_binding['pchembl_value_max'] = _binding_pair_groups['pchembl_value'].transform('max')
df_mols_binding['pchembl_value_median'] = _binding_pair_groups['pchembl_value'].transform('median')
df_mols_binding['first_publication_target_cmpd_pair'] = _binding_pair_groups['year'].transform('min')

In [15]:
# table that has rows for pchembl_mean, max, median based on all assay data (only_binding = False)
# and rows for values based on only binding assays (only_binding = True)
df_mols['only_binding'] = False
df_mols_binding['only_binding'] = True
# Stack both tables, remove the per-measurement columns, and collapse the rows
# that are identical once those columns are gone.
df_combined = (
    pd.concat([df_mols, df_mols_binding])
    .drop(columns=['year', 'journal', 'pchembl_value', 'standard_type', 'assay_type'])
    .drop_duplicates()
)

# Note that keeping salt info increases the number of entries in this table
# We need to decide what to do with these
# df_combined = df_combined.drop(columns=['salt_molregno', 'salt_chemblid',
#        'salt_pref_name', 'prodrug', 'oral',  
#        'parenteral', 'topical', 'black_box_warning'])
# df_combined = df_combined.drop_duplicates()
# df_combined

# Extract drug-target interactions with disease relevance from drug_mechanism table

In [16]:
sql_dti = '''
SELECT DISTINCT mh.parent_molregno, dm.tid, dm.disease_efficacy
FROM drug_mechanism dm
INNER JOIN molecule_hierarchy mh
    on dm.molregno = mh.molregno
INNER JOIN molecule_dictionary md
    on mh.parent_molregno = md.molregno
'''

# Curated drug mechanisms, expressed on the level of parent compounds.
df_dti = (
    pd.read_sql_query(sql_dti, con=engine_ch)
    # drop rows that don't have a tid
    .dropna(subset=['tid'])
    .astype({'tid': 'Int64'})
)
df_dti

Unnamed: 0,parent_molregno,tid,disease_efficacy
0,1124,11060,1
1,675068,10193,1
2,1125,10193,1
3,1085,10193,1
4,1124,10193,1
...,...,...,...
4756,1304559,101019,1
4757,1304559,100417,1
4758,2336099,11540,1
4759,2146132,100097,1


In [17]:
# disease_efficacy NUMBER
# Flag to show whether the target assigned is believed to play a role in the efficacy of the drug in the indication(s)
# for which it is approved (1 = yes, 0 = no)
# Keep only the mechanisms flagged as relevant for efficacy.
df_dti = df_dti.loc[df_dti['disease_efficacy'].eq(1)].copy()

## Add single proteins for protein families, protein complexes, protein complex groups
## Query protein mappings

In [18]:
# Pairs of related targets from target_relations. Both sides are annotated with
# name, type and organism from target_dictionary: td1 describes `tid`,
# td2 describes `related_tid`.
# NOTE: this reuses the variable name sql_dti of the drug_mechanism query.
sql_dti = '''
SELECT tr.tid, tr.relationship, tr.related_tid, 
    td1.pref_name as pref_name_1, td1.target_type as target_type_1, td1.organism as organism_1, 
    td2.pref_name as pref_name_2, td2.target_type as target_type_2, td2.organism as organism_2, td2.chembl_id as Target_chembl_id_2 
FROM target_relations tr
INNER JOIN target_dictionary td1
    on tr.tid = td1.tid
INNER JOIN target_dictionary td2
    on tr.related_tid = td2.tid
'''

df_related_targets = pd.read_sql_query(sql_dti, con=engine_ch)
df_related_targets.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
0,10193,SUBSET OF,104764,Carbonic anhydrase I,SINGLE PROTEIN,Homo sapiens,Carbonic anhydrase,PROTEIN FAMILY,Homo sapiens,CHEMBL2095180
1,12071,SUBSET OF,109746,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase,PROTEIN FAMILY,Homo sapiens,CHEMBL3559691
2,12071,SUBSET OF,104709,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/cyclin B,PROTEIN COMPLEX,Homo sapiens,CHEMBL2094127
3,12071,SUBSET OF,107893,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,CDK1/Cyclin A,PROTEIN COMPLEX,Homo sapiens,CHEMBL3038467
4,12071,SUBSET OF,117095,Cyclin-dependent kinase 1,SINGLE PROTEIN,Homo sapiens,Cyclin-dependent kinase 1/G1/S-specific cyclin-D1,PROTEIN COMPLEX,Homo sapiens,CHEMBL3885551


In [19]:
def _relations_to_single_proteins(source_target_type, relationship="SUPERSET OF"):
    """Select the rows of df_related_targets whose first target has the given
    type, whose second target is a single protein, and whose relationship matches."""
    relations = df_related_targets
    return relations[(relations["target_type_1"] == source_target_type)
                     & (relations["target_type_2"] == "SINGLE PROTEIN")
                     & (relations["relationship"] == relationship)]


protein_complex_mapping = _relations_to_single_proteins("PROTEIN COMPLEX")

protein_family_mapping = _relations_to_single_proteins("PROTEIN FAMILY")

protein_complex_group_mapping = _relations_to_single_proteins("PROTEIN COMPLEX GROUP")

# TODO: should these be included? which direction (how to avoid duplications)?
single_protein_mapping = _relations_to_single_proteins("SINGLE PROTEIN", relationship="EQUIVALENT TO")

# TODO: should these be included?
chimeric_protein_mapping = _relations_to_single_proteins("CHIMERIC PROTEIN")

# TODO: should these be included?
ppi_mapping = _relations_to_single_proteins("PROTEIN-PROTEIN INTERACTION")

In [20]:
# Mappings that are currently used to expand drug targets to single proteins;
# the commented-out entries are the still undecided candidates.
relevant_mappings = pd.concat([
    protein_complex_mapping,
    protein_family_mapping,
    protein_complex_group_mapping,
    #single_protein_mapping,
    #chimeric_protein_mapping,
    #ppi_mapping,
]).astype({'tid': 'Int64'})
relevant_mappings.head()

Unnamed: 0,tid,relationship,related_tid,pref_name_1,target_type_1,organism_1,pref_name_2,target_type_2,organism_2,Target_chembl_id_2
249,104282,SUPERSET OF,10819,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein alpha chain,SINGLE PROTEIN,Homo sapiens,CHEMBL4808
251,104282,SUPERSET OF,12715,Acetylcholine receptor; alpha1/beta1/delta/gamma,PROTEIN COMPLEX,Homo sapiens,Acetylcholine receptor protein delta chain,SINGLE PROTEIN,Homo sapiens,CHEMBL3011
260,104284,SUPERSET OF,10023,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor protein alpha-2 subunit,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2584
261,104284,SUPERSET OF,12717,Neuronal acetylcholine receptor; alpha2/beta4,PROTEIN COMPLEX,Rattus norvegicus,Neuronal acetylcholine receptor subunit beta-4,SINGLE PROTEIN,Rattus norvegicus,CHEMBL2658
275,104685,SUPERSET OF,49,Integrin alpha-IIb/beta-3,PROTEIN COMPLEX,Homo sapiens,Integrin alpha-IIb,SINGLE PROTEIN,Homo sapiens,CHEMBL212


In [21]:
# DT-interactions and targets based on drug_mechanisms table
dti_tids_original = set(df_dti['tid'])
DTIs_original = set(df_dti.agg('{0[parent_molregno]}_{0[tid]}'.format, axis=1))

# DT-interactions and targets with other target IDs based on mapping
# (every mechanism target is replaced by the single proteins it is mapped to)
df_dti_add_targets = pd.merge(df_dti, relevant_mappings, on='tid', how='inner')
dti_tids_mapped = set(df_dti_add_targets['related_tid'].astype("int"))
DTIs_mapped = set(df_dti_add_targets.agg('{0[parent_molregno]}_{0[related_tid]}'.format, axis=1))

# combined DT-interactions and targets
DTIs_set = DTIs_original | DTIs_mapped
dti_tids_set = dti_tids_original | dti_tids_mapped

In [22]:
############### TESTING: drug_mechanisms available data comparison ###############
# Including mapped target IDs significantly improves the intersection 
# between the comparison dataset and the drug_mechanisms table.
# However, there are generally targets and drug-target pairs not present in the drug_mechanisms table
# that are present in the old dataset.

# Comparison keys without the mutation ending, since drug_mechanism targets are plain tids.
df_comparison['PAIRS_tid'] = df_comparison.agg('{0[PARENT_MOLREGNO]}_{0[TID_no_ending]}'.format, axis=1)
comp_dti_targets = set(df_comparison["TID_no_ending"])
comp_dti_drug_pairs = set(df_comparison.loc[df_comparison['DTI'] == 'D_DT', "PAIRS_tid"])

dti_mols = set(df_dti['parent_molregno'])
mol_intersect = print_comparison("Drugs (molregno)", dti_mols, comp_drugs, True)

print('................. Before mapping target IDs .................')
dti_targets = dti_tids_original
dti_pairs = DTIs_original
targets_intersect = print_comparison("Targets (tid)", dti_targets, comp_dti_targets, True)
pairs_intersect = print_comparison("Drug-target pairs", dti_pairs, comp_dti_drug_pairs, True)

print('................. Including mapping target IDs .................')
dti_mapped_target = dti_tids_set
dti_mapped_pairs = DTIs_set
targets_mapped_intersect = print_comparison("Targets (tid)", dti_mapped_target, comp_dti_targets, True)
pairs_mapped_intersect = print_comparison("Drug-target pairs", dti_mapped_pairs, comp_dti_drug_pairs, True)


# Same row layout as the other summary rows; positions that are not evaluated here are 0.
all_intersect_comp.append(['drug_mechanisms mapped',
                           0, mol_intersect,
                           targets_mapped_intersect, 0,
                           0, pairs_mapped_intersect])

------
Drugs (molregno)
current:        3528
comparison:     686
intersection:   629
................. Before mapping target IDs .................
------
Targets (tid)
current:        1080
comparison:     908
intersection:   654
------
Drug-target pairs
current:        4535
comparison:     1221
intersection:   653
................. Including mapping target IDs .................
------
Targets (tid)
current:        1369
comparison:     908
intersection:   895
------
Drug-target pairs
current:        8579
comparison:     1221
intersection:   1113


# DTI classification

Identify which TIDs are drug targets from the drug_mechanism table and add field called "defined_DTI": 
    Value: "True" if it is a drug with a curated drug_mechanism and "False" if not.

Map again to the drug mechanism table via "tid" to identify therapeutic targets and add column "therapeutic_target":
    Value: "True" | "False"

Use this logic to define per compound/target pair whether it corresponds to:
    
    drug and its therapeutic target "DTI": "D_DT";
        
    a drug target but not a drug "DTI": "DT";
        
    not a drug and not a drug target "DTI": "NDT";
        
(done by first creating a new column "DT_assoc" in both the master and the mapping table which reflects the molregno-tid association and then mapping it)

In [23]:
# drugs_set = set(df_combined[df_combined['max_phase'] == 4]["parent_molregno"])
# df_combined['is_drug'] = df_combined['parent_molregno'].isin(drugs_set)

# True if the target (or a protein it was mapped to) occurs in the drug_mechanism-derived target set.
df_combined['therapeutic_target'] = df_combined['tid'].isin(dti_tids_set)
# "<parent_molregno>_<tid>" key, the same format as the entries of DTIs_set.
# Built column-wise: this gives the same strings as formatting every row
# individually, without a Python-level call per row on a very large table.
df_combined['DT_assoc'] = df_combined['parent_molregno'].astype(str) + '_' + df_combined['tid'].astype(str)

In [24]:
# Default label for rows that none of the rules below touch.
df_combined['DTI'] = "Nan"
# The rules are applied in sequence and later ones overwrite earlier ones:
# a pair listed in DTIs_set is relabelled "DT" if the compound's max_phase is not 4.
df_combined.loc[df_combined['DT_assoc'].isin(DTIs_set), 'DTI'] = "D_DT"
df_combined.loc[df_combined['therapeutic_target'].eq(True) & df_combined['max_phase'].ne(4), 'DTI'] = "DT"
df_combined.loc[df_combined['therapeutic_target'].eq(False) & df_combined['max_phase'].ne(4), 'DTI'] = "NDT"

In [25]:
############### TESTING: before reducing to D_DT and DT ###############
# Record the overlap with the comparison datasets under the label "pre DTI"
# (nothing is printed because `output` keeps its default of False).
add_intersections(df_combined, "pre DTI")

In [26]:
# keep only D_DT and DT
# (rows labelled "NDT" or still carrying the default "Nan" are removed)
df_combined = df_combined.loc[df_combined['DTI'].isin(['D_DT', 'DT'])]

In [27]:
############### TESTING: after reducing to D_DT and DT ###############
# Record the overlap with the comparison datasets under the label "post DTI"
# (nothing is printed because `output` keeps its default of False).
add_intersections(df_combined, "post DTI")

In [28]:
# ############### TESTING ###############
# # This was supposedly changed to include single proteins for protein families / complexes
# df_combined[(df_combined['parent_pref_name'] == "PRAZOSIN") 
#             &(df_combined['target_pref_name'].str.contains("drenergic receptor"))  
#             & (df_combined['DTI'] == 'D_DT')
#             & (df_combined['only_binding'] == True)][['tid', 'target_pref_name', 'pchembl_value_mean', 'pchembl_value_median', 'pchembl_value_max', 'target_type']]

In [29]:
# ############### TESTING ###############
# # This potentially doesn't make sense: the max pcheml should be the same as the protein family?
# df_comparison[(df_comparison['CMPD_PREF_NAME'] == "PRAZOSIN") 
#             &(df_comparison['TARGET_PREF_NAME'].str.contains("drenergic receptor"))  
#             & (df_comparison['DTI'] == 'D_DT')][['TID', 'PCHEMBL_MEAN', 'PCHEMBL_MEDIAN', 'PCHEMBL_MAX', 'TARGET_TYPE']]

# Add compound properties

In [30]:
# Calculated properties and structure representations, keyed by parent_molregno.
# The join runs over molecule_hierarchy, so a parent is returned once for every
# hierarchy row that points to it; the result can therefore contain identical rows.
sql_cpd_props = '''

SELECT mh.parent_molregno, 
    cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
    cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
    cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
    cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
    struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
FROM compound_properties cp, molecule_hierarchy mh, compound_structures struct
-- FROM chembl_31.compound_properties cp, chembl_31.molecule_hierarchy mh, chembl_31.compound_structures struct
WHERE cp.molregno=mh.parent_molregno
    and struct.molregno=mh.parent_molregno

'''

df_cpd_props = pd.read_sql_query(sql_cpd_props, con=engine_ch)
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations,standard_inchi,standard_inchi_key,canonical_smiles
0,1,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.48,,3.63,2.69,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-11(21-17(24)20-15(22)9-19-21)6-7-12(10)16(23)13-4-2-3-5-14(13)18/h...,OWRSAHYFSSNENM-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,332.32,1.33,6.0,1.0,108.61,3.0,N,0.0,6.33,,2.88,1.82,ACID,332.32,3.0,25.0,0.73,332.0909,C18H12N4O3,7.0,1.0,0.0,InChI=1S/C18H12N4O3/c1-11-8-14(22-18(25)21-16(23)10-20-22)6-7-15(11)17(24)13-4-2-12(9-19)3-5-13/...,ZJYUMURGSZQFMH-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,357.8,2.27,5.0,2.0,87.98,3.0,N,0.0,6.33,,3.7,2.64,ACID,357.8,3.0,25.0,0.75,357.088,C18H16ClN3O3,6.0,2.0,0.0,InChI=1S/C18H16ClN3O3/c1-10-7-14(22-18(25)21-15(23)9-20-22)8-11(2)16(10)17(24)12-3-5-13(19)6-4-1...,YOMWDCALSDWFSV-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,307.31,1.46,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.02,1.97,ACID,307.31,3.0,23.0,0.74,307.0957,C17H13N3O3,6.0,1.0,0.0,InChI=1S/C17H13N3O3/c1-11-2-4-12(5-3-11)16(22)13-6-8-14(9-7-13)20-17(23)19-15(21)10-18-20/h2-10H...,PSOPUAQFGCRDIP-UHFFFAOYSA-N,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.63,2.57,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0,InChI=1S/C17H12ClN3O3/c1-10-8-13(21-17(24)20-15(22)9-19-21)6-7-14(10)16(23)11-2-4-12(18)5-3-11/h...,KEZNSCMBVRNOHO-UHFFFAOYSA-N,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


In [31]:
# The property query joins through molecule_hierarchy on parent_molregno, so a parent
# can come back in several fully identical rows; keep the first of each such group.
df_cpd_props_unique = df_cpd_props.drop_duplicates(keep="first")

In [32]:
# df_cpd_props_unique.to_csv(path_results+"ChEMBL"+chembl_version+"_cpd_props.csv", sep = ';', index = False)

## Combine initial query with compound properties

In [33]:
# Inner join: rows whose parent_molregno has no entry in the property table are dropped.
df_combined = pd.merge(
    df_combined,
    df_cpd_props_unique,
    how="inner",
    on="parent_molregno",
)

In [34]:
# df_combined.to_csv(path_results+"ChEMBL"+chembl_version+"_cpds_with_props.csv", sep = ';', index = False)

In [35]:
############### TESTING: compound props ###############
# Record a comparison snapshot of df_combined under the label "cpd props".
# add_intersections is defined earlier in the notebook; presumably it appends a row to the
# overview tables shown at the end of the notebook -- verify against its definition.
add_intersections(df_combined, "cpd props")

# Calculate LE metrics

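Ligand Efficiency (LE):

$\text{LE} = \frac{\Delta\text{G}}{\text{HA}}$
where $\Delta\text{G} = -RT \ln(K_d)$, $-RT\ln(K_i)$, or $-RT\ln(IC_{50})$

With $R = 0.00199\ \text{kcal}\,\text{mol}^{-1}\,\text{K}^{-1}$, $T = 298\ \text{K}$, and the factor $2.303 \approx \ln(10)$ converting the $\log_{10}$-based pChEMBL value to a natural logarithm, this becomes:

$\text{LE}=\frac{(2.303 \cdot 298 \cdot 0.00199 \cdot \text{pchembl_mean})} {\text{heavy_atoms}}$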


$\text{BEI}=\frac{\text{pchembl_mean} \cdot 1000} {\text{mw_freebase}}$

$\text{SEI}=\frac{\text{pchembl_mean} \cdot 100} {\text{PSA}}$

$\text{LLE}=\text{pchembl_mean}-\text{ALOGP}$

In [36]:
# Efficiency metrics derived from the mean pChEMBL value of each row.
# 2.303*298*0.00199 is the ln(10) * T * R factor from the LE formula in the markdown above.
df_combined['LE'] = df_combined['pchembl_value_mean'].div(df_combined['heavy_atoms']).mul(2.303*298*0.00199)
# BEI: potency per kDa of free-base molecular weight
df_combined['BEI'] = df_combined['pchembl_value_mean'].mul(1000).div(df_combined["mw_freebase"])
# SEI: potency per 100 A^2 of polar surface area
df_combined['SEI'] = df_combined['pchembl_value_mean'].mul(100).div(df_combined["psa"])
# LLE: potency corrected for lipophilicity
df_combined['LLE'] = df_combined['pchembl_value_mean'].sub(df_combined["alogp"])

# Add Descriptors

In [37]:
# available descriptors: each entry of Descriptors._descList is indexed by position,
# and element 0 is the descriptor name that gets printed
print([descriptor[0] for descriptor in Descriptors._descList])

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'Slo

In [38]:
# PandasTools.AddMoleculeColumnToFrame(df_combined,'canonical_smiles','mol',includeFingerprints=False)

# df_combined.loc[:,'FractionCSP3'] = df_combined['mol'].apply(Descriptors.FractionCSP3)
# df_combined.loc[:,'NumAliphaticCarbocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticCarbocycles)
# df_combined.loc[:,'NumAliphaticHeterocycles'] = df_combined['mol'].apply(Descriptors.NumAliphaticHeterocycles)
# df_combined.loc[:,'NumAliphaticRings'] = df_combined['mol'].apply(Descriptors.NumAliphaticRings)
# df_combined.loc[:,'NumAromaticCarbocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticCarbocycles)
# df_combined.loc[:,'NumAromaticHeterocycles'] = df_combined['mol'].apply(Descriptors.NumAromaticHeterocycles)
# df_combined.loc[:,'NumAromaticRings'] = df_combined['mol'].apply(Descriptors.NumAromaticRings)
# df_combined.loc[:,'NumHeteroatoms'] = df_combined['mol'].apply(Descriptors.NumHeteroatoms)
# df_combined.loc[:,'NumSaturatedCarbocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedCarbocycles)
# df_combined.loc[:,'NumSaturatedHeterocycles'] = df_combined['mol'].apply(Descriptors.NumSaturatedHeterocycles)
# df_combined.loc[:,'NumSaturatedRings'] = df_combined['mol'].apply(Descriptors.NumSaturatedRings)
# df_combined.loc[:,'RingCount'] = df_combined['mol'].apply(Descriptors.RingCount)

In [39]:
# # drop the column with RDKit molecules
# df_combined = df_combined.drop(['mol'] , axis=1)
# len(df_combined)

# Add scaffold smiles

In [40]:
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Scaffolds import MurckoScaffold

# note: this takes a few minutes to calculate for all molecules
def calculate_scaffolds(smiles_set):
    """Compute Murcko scaffold SMILES for every ring-containing molecule.

    Parameters
    ----------
    smiles_set : iterable
        SMILES strings to process (typically the unique values of a SMILES column).

    Returns
    -------
    tuple of (dict, dict)
        ``scaffolds_dict`` maps each input SMILES to the SMILES of its Murcko scaffold;
        ``scaffolds_no_stereo_dict`` maps it to the scaffold SMILES obtained after
        stereochemistry has been removed from the molecule.
        Entries that are not strings (e.g. NaN), that RDKit cannot parse, or that
        contain no ring are skipped and therefore absent from both dictionaries.
    """
    scaffolds_dict = dict()
    scaffolds_no_stereo_dict = dict()
    for smiles in tqdm(smiles_set):
        # Missing values taken from a DataFrame column are floats (NaN), not SMILES.
        if not isinstance(smiles, str):
            continue

        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None instead of raising when parsing fails.
        if mol is None:
            continue

        # Molecules without rings are skipped, so they get no dictionary entry.
        if rdMolDescriptors.CalcNumRings(mol) == 0:
            continue

        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_dict[smiles] = Chem.MolToSmiles(scaffold)

        # repeat after removing stereochemistry
        # (RemoveStereochemistry modifies mol in place, hence it comes second)
        Chem.RemoveStereochemistry(mol)
        scaffold_no_stereo = MurckoScaffold.GetScaffoldForMol(mol)
        scaffolds_no_stereo_dict[smiles] = Chem.MolToSmiles(scaffold_no_stereo)

    return scaffolds_dict, scaffolds_no_stereo_dict

In [41]:
# smiles_set = set(df_combined["canonical_smiles"])
# scaffolds_dict, scaffolds_no_stereo_dict = calculate_scaffolds(smiles_set)

# df_combined['scaffold_wo_stereo'] = df_combined['canonical_smiles'].map(scaffolds_no_stereo_dict)
# df_combined["scaffold_w_stereo"] = df_combined['canonical_smiles'].map(scaffolds_dict)

# Filter for targets (all assay types) with at least 100 compounds per target

## At least 100 compounds per target

In [42]:
# consider all assay types: keep the rows whose only_binding flag is False
df_combined_all_assays = df_combined.loc[df_combined['only_binding'].eq(False)]

In [43]:
min_nof_cpds = 100

# NOTE(review): count() tallies rows with a non-null parent_molregno per tid_mutation.
# If a compound can occur in more than one row per target, this is a row count rather
# than a distinct-compound count -- confirm that this is intended.
comparator_counts = df_combined_all_assays.groupby('tid_mutation')['parent_molregno'].count()
targets_w_enough_cpds = comparator_counts.loc[comparator_counts.ge(min_nof_cpds)].index.tolist()
# keep only the rows of targets that reach the threshold
df_combined_all_assays_100 = df_combined_all_assays[
    df_combined_all_assays['tid_mutation'].isin(targets_w_enough_cpds)
]

In [44]:
# df_combined_all_assays.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.csv", sep = ";", index = False)
# df_combined_all_assays_100.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100.csv", sep = ";", index = False)

# df_combined_all_assays.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays.xlsx", index = False)
# df_combined_all_assays_100.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100.xlsx", index = False)

## Only drug targets 

In [45]:
# targets that have at least one row annotated as D_DT ...
d_dt_targets = set(
    df_combined_all_assays_100.loc[df_combined_all_assays_100['DTI'].eq('D_DT'), 'tid_mutation'].tolist()
)
# ... and every row (any DTI value) belonging to one of those targets
df_d_dt_targets = df_combined_all_assays_100[
    df_combined_all_assays_100['tid_mutation'].isin(d_dt_targets)
]

In [46]:
# df_d_dt_targets.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100_d_dt.csv", sep = ";", index = False)
# df_d_dt_targets.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_all_assays_100_d_dt.xlsx", index = False)

In [47]:
############### TESTING: all assay types ###############
# One comparison snapshot per filtering stage of the all-assay-types data set.
# add_intersections is defined earlier in the notebook; presumably the labels passed here
# are the 'type' values shown in the overview tables at the end -- verify.
add_intersections(df_combined_all_assays, "all assays")
add_intersections(df_combined_all_assays_100, "all, >= 100")
add_intersections(df_d_dt_targets, "all, >= 100, d_dt")

In [48]:
############### TESTING ###############
# Number of targets that pass the compound threshold, overall and with a D_DT annotation.
print("{:53} {}".format("Targets with >= 100 comparators:", len(targets_w_enough_cpds)))
print("{:53} {}\n".format("Targets with >= 100 comparators and d_dt assignment:", len(d_dt_targets)))

# Per clinical phase 1-4: distinct targets that have at least one row with that max_phase.
for max_phase in (1, 2, 3, 4):
    print("Phase: ", max_phase)
    df_phase = df_combined_all_assays_100.loc[df_combined_all_assays_100['max_phase'].eq(max_phase)]
    df_d_dt_targets_with_phase = df_d_dt_targets.loc[df_d_dt_targets['max_phase'].eq(max_phase)]
    print("{:50} {}: {}".format("#Targets with annotated cpds in max_phase", max_phase, df_phase['tid_mutation'].nunique()))
    print("{:50} {}: {}\n".format("#D_DT targets with annotated cpds in max_phase", max_phase, df_d_dt_targets_with_phase['tid_mutation'].nunique()))

Targets with >= 100 comparators:                      564
Targets with >= 100 comparators and d_dt assignment:  294

Phase:  1
#Targets with annotated cpds in max_phase          1: 295
#D_DT targets with annotated cpds in max_phase     1: 167

Phase:  2
#Targets with annotated cpds in max_phase          2: 413
#D_DT targets with annotated cpds in max_phase     2: 235

Phase:  3
#Targets with annotated cpds in max_phase          3: 411
#D_DT targets with annotated cpds in max_phase     3: 237

Phase:  4
#Targets with annotated cpds in max_phase          4: 294
#D_DT targets with annotated cpds in max_phase     4: 294



# Filter for targets (only binding assays) with at least 100 comparator compounds

## At least 100 compounds per target

In [49]:
# consider only binding assays: keep the rows whose only_binding flag is True
# (the restriction to drug targets happens in a later cell, not here)
df_comb_B = df_combined.loc[df_combined['only_binding'].eq(True)]

In [50]:
min_nof_cpds = 100

# NOTE(review): count() tallies rows with a non-null parent_molregno per tid_mutation.
# If a compound can occur in more than one row per target, this is a row count rather
# than a distinct-compound count -- confirm that this is intended.
comparator_counts_B = df_comb_B.groupby('tid_mutation')['parent_molregno'].count()
targets_w_enough_cpds_B = comparator_counts_B.loc[comparator_counts_B.ge(min_nof_cpds)].index.tolist()
# keep only the rows of targets that reach the threshold
df_comb_B_100 = df_comb_B[df_comb_B['tid_mutation'].isin(targets_w_enough_cpds_B)]

In [51]:
# df_comb_B.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays.csv", sep = ";", index = False)
# df_comb_B_100.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100.csv", sep = ";", index = False)

# df_comb_B.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays.xlsx", index = False)
# df_comb_B_100.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100.xlsx", index = False)

## Only drug targets 

In [52]:
# targets that have at least one row annotated as D_DT ...
d_dt_targets_B = set(df_comb_B_100.loc[df_comb_B_100['DTI'].eq('D_DT'), 'tid_mutation'].tolist())
# ... and every row (any DTI value) belonging to one of those targets
df_d_dt_targets_B = df_comb_B_100[df_comb_B_100['tid_mutation'].isin(d_dt_targets_B)]

In [53]:
# df_d_dt_targets_B.to_csv(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100_d_dt.csv", sep = ";", index = False)
# df_d_dt_targets_B.to_excel(path_results+"ChEMBL"+chembl_version+"_DTI_binding_assays_100_d_dt.xlsx", index = False)

In [54]:
############### TESTING: binding assays ###############
# One comparison snapshot per filtering stage of the binding-assay data set.
# add_intersections is defined earlier in the notebook; presumably the labels passed here
# are the 'type' values shown in the overview tables at the end -- verify.
add_intersections(df_comb_B, "binding")
add_intersections(df_comb_B_100, "b, >= 100")
add_intersections(df_d_dt_targets_B, "b, >= 100, d_dt")

In [55]:
############### TESTING ###############
# Number of targets that pass the compound threshold, overall and with a D_DT annotation.
print("{:53} {}".format("Targets with >= 100 comparators:", len(targets_w_enough_cpds_B)))
print("{:53} {}\n".format("Targets with >= 100 comparators and d_dt assignment:", len(d_dt_targets_B)))

# Per clinical phase 1-4: distinct targets that have at least one row with that max_phase.
for max_phase in (1, 2, 3, 4):
    print("Phase: ", max_phase)
    df_phase = df_comb_B_100.loc[df_comb_B_100['max_phase'].eq(max_phase)]
    df_d_dt_targets_with_phase = df_d_dt_targets_B.loc[df_d_dt_targets_B['max_phase'].eq(max_phase)]
    print("{:50} {}: {}".format("#Targets with annotated cpds in max_phase", max_phase, df_phase['tid_mutation'].nunique()))
    print("{:50} {}: {}\n".format("#D_DT targets with annotated cpds in max_phase", max_phase, df_d_dt_targets_with_phase['tid_mutation'].nunique()))

Targets with >= 100 comparators:                      534
Targets with >= 100 comparators and d_dt assignment:  279

Phase:  1
#Targets with annotated cpds in max_phase          1: 274
#D_DT targets with annotated cpds in max_phase     1: 156

Phase:  2
#Targets with annotated cpds in max_phase          2: 381
#D_DT targets with annotated cpds in max_phase     2: 218

Phase:  3
#Targets with annotated cpds in max_phase          3: 380
#D_DT targets with annotated cpds in max_phase     3: 223

Phase:  4
#Targets with annotated cpds in max_phase          4: 279
#D_DT targets with annotated cpds in max_phase     4: 279



# Overview of comparisons to old ChEMBL26 dataset

In [56]:
############### TESTING: development of intersection(curr_data, old_dataset) ###############
# all_intersect_comp is filled earlier in the notebook; one row per recorded stage.
print("intersection(curr_data, old_dataset)")
pd.DataFrame(
    data=all_intersect_comp,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

intersection(curr_data, old_dataset)


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,comparison,378661,686,910,351,561772,1221
1,init,378661,616,907,349,560759,1120
2,drug_mechanisms mapped,0,629,895,0,0,1113
3,pre DTI,378661,616,907,349,560759,1120
4,post DTI,375142,595,889,335,551887,1075
5,cpd props,375142,595,889,335,551887,1075
6,all assays,375142,595,889,335,551887,1075
7,"all, >= 100",370077,558,549,268,542886,987
8,"all, >= 100, d_dt",239768,558,287,268,355397,987
9,binding,375116,595,889,335,551886,1075


In [57]:
############### TESTING: development of intersection(curr_data, old_dataset) limited to >= 100 cpds ###############
# all_intersect_comp_100 is filled earlier in the notebook; one row per recorded stage.
print("intersection(curr_data, old_dataset) limited to >= 100 cpds")
pd.DataFrame(
    data=all_intersect_comp_100,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

intersection(curr_data, old_dataset) limited to >= 100 cpds


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,comparison_100,372452,643,525,271,550227,1104
1,init,372452,573,524,269,549279,1007
2,pre DTI,372452,573,524,269,549279,1007
3,post DTI,369132,556,519,259,541154,973
4,cpd props,369132,556,519,259,541154,973
5,all assays,369132,556,519,259,541154,973
6,"all, >= 100",368888,555,517,258,540990,972
7,"all, >= 100, d_dt",239420,555,274,258,354656,972
8,binding,369107,556,519,259,541153,973
9,"b, >= 100",368772,554,514,256,540711,970


In [58]:
############### TESTING: development of intersection(curr_data, old_dataset) limited to >= 100 cpds, d_dt targets ###############
# all_intersect_comp_d_dt is filled earlier in the notebook; one row per recorded stage.
print("intersection(curr_data, old_dataset) limited to >= 100 cpds, d_dt targets")
pd.DataFrame(
    data=all_intersect_comp_d_dt,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

intersection(curr_data, old_dataset) limited to >= 100 cpds, d_dt targets


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,comparison_d_dt,238281,643,271,271,360048,1104
1,init,238281,573,270,269,359302,1007
2,pre DTI,238281,573,270,269,359302,1007
3,post DTI,237500,556,268,259,355378,973
4,cpd props,237500,556,268,259,355378,973
5,all assays,237500,556,268,259,355378,973
6,"all, >= 100",237419,555,267,258,355311,972
7,"all, >= 100, d_dt",231319,555,258,258,346890,972
8,binding,237485,556,268,259,355377,973
9,"b, >= 100",237317,554,265,256,355128,970


In [59]:
############### TESTING: development of size(curr_data) vs. size(old_dataset) ###############
# all_length_comp is filled earlier in the notebook; one row per recorded stage.
print("Size(curr_data) vs. Size(old_dataset)")
pd.DataFrame(
    data=all_length_comp,
    columns=['type', 'mols', 'drugs', 'targets', 'drug_targets', 'cpd_target', 'drug_target'],
)

Size(curr_data) vs. Size(old_dataset)


Unnamed: 0,type,mols,drugs,targets,drug_targets,cpd_target,drug_target
0,comparison,378661,686,910,351,561772,1221
1,comparison_100,372452,643,525,271,550227,1104
2,comparison_d_dt,238281,643,271,271,360048,1104
3,init,923862,1637,6893,2569,2094029,21601
4,pre DTI,923862,1637,6893,2569,2094029,21601
5,post DTI,479546,693,1349,481,720874,1406
6,cpd props,479110,691,1349,481,720205,1404
7,all assays,479110,691,1349,481,720205,1404
8,"all, >= 100",472889,650,564,294,706746,1147
9,"all, >= 100, d_dt",301749,650,294,294,446409,1147


## Analyse differences in pchembl_mean

In [60]:
def round_decimal(x):
    """Round ``x`` to two decimals and return the result as a string.

    Before rounding, a small positive offset of ``10**(-len(str(x)) - 1)`` is added,
    so a value that prints as an exact tie at the third decimal is rounded up
    (e.g. 2.675 -> '2.68'). The string is ``str()`` of the rounded float, so
    trailing zeros are not padded (4.50 -> '4.5').
    """
    offset = 10 ** (-len(str(x)) - 1)
    return str(round(x + offset, 2))

def identify_mean_differences(data):
    """Print how often the current and the comparison mean pChEMBL values disagree.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'pchembl_value_mean' (current value),
        'PCHEMBL_MEAN' (comparison value) and 'tid_mutation' (target identifier).

    Two values count as different when their ``round_decimal`` strings differ.
    The function prints the number of rows, the number of differing rows, the number
    of distinct targets, and the number of targets with at least one differing row.
    It returns None.
    """
    problems = 0
    targets_with_issues = set()
    # Walk the three required columns in parallel instead of using DataFrame.iterrows(),
    # which builds a Series object for every row and is slow on large frames.
    value_triples = zip(data['pchembl_value_mean'], data['PCHEMBL_MEAN'], data['tid_mutation'])
    for now_value, comp_value, target in value_triples:
        now_mean = round_decimal(float(now_value))
        comp_mean = round_decimal(float(comp_value))
        if now_mean != comp_mean:
            problems += 1
            targets_with_issues.add(target)

    print("Rows:", len(data))
    print("Mean differences:", problems)
    print("#Targets:", len(set(data['tid_mutation'])))
    print("#Targets with differences:", len(targets_with_issues))

In [61]:
# Current data (binding assays, >= 100 compounds per target): one row per distinct
# combination of the selected columns, keyed by a "<parent_molregno>_<tid_mutation>" string.
a = (
    df_comb_B_100[['parent_molregno', 'tid', 'tid_mutation', 'pchembl_value_mean', 'pchembl_value_max', 'pchembl_value_median']]
    .assign(PAIRS=lambda frame: frame.agg('{0[parent_molregno]}_{0[tid_mutation]}'.format, axis=1))
    .drop_duplicates()
    .sort_values(by=['parent_molregno', 'tid_mutation', 'pchembl_value_mean'])
    .reset_index(drop=True)
)
# Comparison data set: only the aggregated pChEMBL columns plus the pair key.
b = df_comparison[['PCHEMBL_MEAN', 'PCHEMBL_MAX', 'PCHEMBL_MEDIAN', 'PAIRS']].copy()
# Pairs present in both data sets, with current and comparison values side by side.
overlapping_data = pd.merge(a, b, on='PAIRS', how='inner')
overlapping_data

Unnamed: 0,parent_molregno,tid,tid_mutation,pchembl_value_mean,pchembl_value_max,pchembl_value_median,PAIRS,PCHEMBL_MEAN,PCHEMBL_MAX,PCHEMBL_MEDIAN
0,23,11291,11291,6.100,6.10,6.100,23_11291,6.10,6.10,6.10
1,39,11628,11628,5.920,5.92,5.920,39_11628,5.92,5.92,5.92
2,46,11628,11628,4.100,4.10,4.100,46_11628,4.10,4.10,4.10
3,53,11628,11628,4.090,4.09,4.090,53_11628,4.09,4.09,4.09
4,97,103,103,9.430,9.70,9.485,97_103,9.43,9.70,9.49
...,...,...,...,...,...,...,...,...,...,...
541208,2340980,12687,12687,6.300,6.30,6.300,2340980_12687,6.30,6.30,6.30
541209,2340980,12689,12689,7.300,7.30,7.300,2340980_12689,7.30,7.30,7.30
541210,2340981,104,104,6.200,6.20,6.200,2340981_104,6.20,6.20,6.20
541211,2340985,71,71,4.495,4.62,4.495,2340985_71,4.50,4.62,4.50


In [62]:
# the differences to the last version are due to the order of the rows when calculating mean pChEMBL values
identify_mean_differences(overlapping_data)

Rows: 541213
Mean differences: 2074
#Targets: 521
#Targets with differences: 311


In [63]:
# Current data (binding assays) restricted to rows with max_phase == 4, keyed by a
# "<parent_molregno>_<tid_mutation>" string.
a = (
    df_comb_B.loc[
        df_comb_B["max_phase"].eq(4),
        ['parent_molregno', 'tid', 'tid_mutation', 'pchembl_value_mean', 'pchembl_value_max', 'pchembl_value_median'],
    ]
    .assign(PAIRS=lambda frame: frame.agg('{0[parent_molregno]}_{0[tid_mutation]}'.format, axis=1))
    .drop_duplicates()
    .sort_values(by=['parent_molregno', 'tid_mutation', 'pchembl_value_mean'])
    .reset_index(drop=True)
)
# Pair keys on both sides; the comparison side is limited to its D_DT rows.
now_drug_pairs = set(a['PAIRS'])
comp_drug_pairs = set(df_comparison.loc[df_comparison['DTI'].eq('D_DT'), 'PAIRS'])
intersect = now_drug_pairs & comp_drug_pairs
overlapping_data = pd.merge(a, df_comparison[df_comparison['DTI'] == 'D_DT'], on='PAIRS', how='inner')
# The inner merge already restricts to shared pairs; the isin filter is kept as a safeguard.
limited_overlapping = overlapping_data.loc[overlapping_data['PAIRS'].isin(intersect)]
identify_mean_differences(limited_overlapping)

Rows: 1075
Mean differences: 46
#Targets: 332
#Targets with differences: 36
