# Add Information to Paul’s Oral Drug Set

### Authors: Barbara Zdrazil, Lina Heinzke
### 02/2023

In [1]:
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools

In [2]:
# notebook settings
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

In [3]:
chembl_version = '32'
base_path = '/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/'
path_results = base_path+'results/'
path_sqlite3_database = base_path+'data/chembl_'+chembl_version+'/chembl_'+chembl_version+'_sqlite/chembl_'+chembl_version+'.db'
chembl_con = sqlite3.connect(path_sqlite3_database)
oral_drugs_path = base_path+'data/2023_02_01_Oral_drug_set.xlsx'

# Define Methods to Process Dataset

## Map SMILES to ChEMBL pref_name and id

Get ChEMBL mapping from smiles to pref_name and id.

In [4]:
# Map smiles to salt smiles and parent smiles and parent info
sql = '''
SELECT DISTINCT mh.molregno as salt_molregno, mh.parent_molregno, 
    md_parent.chembl_id as parent_chembl_id, md_parent.pref_name as parent_pref_name, 
    md_salt.pref_name as salt_pref_name, 
    struct_salt.canonical_smiles as salt_smiles, struct_parent.canonical_smiles as parent_smiles
FROM molecule_hierarchy mh
INNER JOIN molecule_dictionary md_salt
    ON mh.molregno = md_salt.molregno                   -- compound information based on salt form
INNER JOIN molecule_dictionary md_parent
    ON mh.parent_molregno = md_parent.molregno          -- compound information based on parent compound
INNER JOIN compound_structures struct_salt
    ON mh.molregno = struct_salt.molregno               -- salt structures
INNER JOIN compound_structures struct_parent
    ON mh.parent_molregno = struct_parent.molregno      -- parent structures
'''

df_cpd_struct = pd.read_sql_query(sql, con=chembl_con)
df_cpd_struct.head()

Unnamed: 0,salt_molregno,parent_molregno,parent_chembl_id,parent_pref_name,salt_pref_name,salt_smiles,parent_smiles
0,1,1,CHEMBL6329,,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccccc1Cl
1,2,2,CHEMBL6328,,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(C#N)cc1
2,3,3,CHEMBL265667,,,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1,Cc1cc(-n2ncc(=O)[nH]c2=O)cc(C)c1C(O)c1ccc(Cl)cc1
3,4,4,CHEMBL6362,,,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1,Cc1ccc(C(=O)c2ccc(-n3ncc(=O)[nH]c3=O)cc2)cc1
4,5,5,CHEMBL267864,,,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1,Cc1cc(-n2ncc(=O)[nH]c2=O)ccc1C(=O)c1ccc(Cl)cc1


Method to map SMILES to canonical SMILES (RDKit-based). Returns canonical smiles and bool whether the calculation was successful. 

In [5]:
def to_canonical(smiles_set):
    smiles_dict = dict()
    smiles_no_stereo_dict = dict()
    smiles_w_problems = set()
    
    for smiles in tqdm(smiles_set):
        try:
            mol = Chem.MolFromSmiles(smiles)
            canonical_smiles = Chem.MolToSmiles(mol)
            smiles_dict[smiles] = canonical_smiles
        except:
            canonical_smiles = smiles
            smiles_dict[smiles] = canonical_smiles
            smiles_w_problems.add(canonical_smiles)
        
        try:
            Chem.RemoveStereochemistry(mol)
            canonical_no_stereo_smiles = Chem.MolToSmiles(mol)
            smiles_no_stereo_dict[smiles] = canonical_no_stereo_smiles
        except:
            canonical_smiles = smiles
            smiles_no_stereo_dict[smiles] = canonical_smiles
            smiles_w_problems.add(canonical_smiles)

    return (smiles_dict, smiles_no_stereo_dict, smiles_w_problems)

Map ChEMBL canonical_smiles to RDKit-based canonical SMILES.

In [6]:
salt_smiles_set = set(df_cpd_struct[df_cpd_struct['salt_smiles'].notnull()]['salt_smiles'])
parent_smiles_set = set(df_cpd_struct[df_cpd_struct['parent_smiles'].notnull()]['parent_smiles'])
smiles_set = salt_smiles_set.union(parent_smiles_set)
smiles_dict, smiles_no_stereo_dict, smiles_w_problems = to_canonical(smiles_set)
print('#Problems:', len(smiles_w_problems))
print(smiles_w_problems)

df_cpd_struct['rdkit_salt_smiles'] = df_cpd_struct['salt_smiles'].map(smiles_dict)
df_cpd_struct['rdkit_salt_no_stereo_smiles'] = df_cpd_struct['salt_smiles'].map(smiles_no_stereo_dict)
df_cpd_struct['rdkit_parent_smiles'] = df_cpd_struct['parent_smiles'].map(smiles_dict)
df_cpd_struct['rdkit_parent_no_stereo_smiles'] = df_cpd_struct['parent_smiles'].map(smiles_no_stereo_dict)

# df_cpd_struct[["rdkit_salt_smiles", "rdkit_salt_problem"]] = df_cpd_struct.apply(lambda row: to_canonical(row.salt_smiles), 
#                                                                            axis='columns', result_type='expand')
# df_cpd_struct[["rdkit_parent_smiles", "rdkit_parent_problem"]] = df_cpd_struct.apply(lambda row: to_canonical(row.parent_smiles), 
#                                                                            axis='columns', result_type='expand')

 78%|████████████████████████▎      | 1825419/2327784 [05:12<01:25, 5902.51it/s][16:56:44] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 10 11 15 16 17 19 20 21

100%|███████████████████████████████| 2327784/2327784 [06:37<00:00, 5850.34it/s]


#Problems: 1
{'Cc1ccc2c(c1)-n1-c(=O)/c=c\\c(=O)-n-2-c2cc(C)ccc2-1'}


Method to calculate mapping from given SMILES to ChEMBL ID (molregno + ChEMBL ID) and pref_name. A SMILES might map to more than one ID.

In [7]:
def get_parent_info(parent_info):
    chembl_molregno = list((parent_info['parent_molregno']))
    chembl_id = list((parent_info['parent_chembl_id']))
    chembl_name = list((parent_info['parent_pref_name']))
    chembl_salt_smiles = list((parent_info['salt_smiles']))
    chembl_parent_smiles = list((parent_info['parent_smiles']))
    return chembl_molregno, chembl_id, chembl_name, chembl_salt_smiles, chembl_parent_smiles


def to_chembl_id(oral_drug_smiles, oral_drug_no_stereo_smiles, compound_name):
    smiles_found = False
    
    if oral_drug_smiles in set(df_cpd_struct["rdkit_salt_smiles"]):
        smiles_found = True
        found_type = 'salt_smiles'
        parent_info = df_cpd_struct[df_cpd_struct["rdkit_salt_smiles"] == oral_drug_smiles]
    elif oral_drug_smiles in set(df_cpd_struct["rdkit_parent_smiles"]):
        smiles_found = True
        found_type = 'parent_smiles'
        parent_info = df_cpd_struct[df_cpd_struct["rdkit_parent_smiles"] == oral_drug_smiles]
    elif oral_drug_no_stereo_smiles in set(df_cpd_struct["rdkit_salt_smiles"]):
        smiles_found = True
        found_type = 'salt_no_stereo_smiles'
        parent_info = df_cpd_struct[df_cpd_struct["rdkit_salt_smiles"] == oral_drug_no_stereo_smiles]
    elif oral_drug_no_stereo_smiles in set(df_cpd_struct["rdkit_parent_smiles"]):
        smiles_found = True
        found_type = 'parent_no_stereo_smiles'
        parent_info = df_cpd_struct[df_cpd_struct["rdkit_parent_smiles"] == oral_drug_no_stereo_smiles]
    elif compound_name.upper() in set(df_cpd_struct['parent_pref_name']):
        smiles_found = True
        found_type = 'name'
        parent_info = df_cpd_struct[df_cpd_struct['parent_pref_name'] == compound_name.upper()]
        if len(parent_info) > 1:
            parent_info = parent_info[parent_info['salt_molregno'] == parent_info['parent_molregno']]
    elif compound_name.upper() in set(df_cpd_struct['salt_pref_name']):
        smiles_found = True
        found_type = 'salt_name'
        parent_info = df_cpd_struct[df_cpd_struct['salt_pref_name'] == compound_name.upper()]
        if len(parent_info) > 1:
            parent_info = parent_info[parent_info['salt_molregno'] == parent_info['parent_molregno']]
    
    if smiles_found:
        chembl_molregno, chembl_id, chembl_name, chembl_salt_smiles, chembl_parent_smiles = get_parent_info(parent_info)
        # output if there is more than one compound in ChEMBL for a SMILES
        if len(chembl_molregno) > 1:
            print(oral_drug_smiles, chembl_molregno, chembl_id, chembl_name)
        return chembl_molregno, chembl_id, chembl_name, chembl_salt_smiles, chembl_parent_smiles, found_type
    else:
        return None, None, None, None, None, None

## Map Compounds to Targets in the drug_mechanism Table

Get interacting compound-target pairs from the drug_mechanism table.

In [8]:
sql = '''
SELECT DISTINCT mh.parent_molregno, dm.tid, md.chembl_id, td.pref_name
FROM drug_mechanism dm
INNER JOIN molecule_hierarchy mh
    ON dm.molregno = mh.molregno
INNER JOIN molecule_dictionary md
    ON mh.parent_molregno = md.molregno
INNER JOIN target_dictionary td
    ON dm.tid = td.tid
WHERE dm.disease_efficacy = 1
    and dm.tid is not null
'''

df_dti = pd.read_sql_query(sql, con=chembl_con)
df_dti

Unnamed: 0,parent_molregno,tid,chembl_id,pref_name
0,1124,11060,CHEMBL19,Carbonic anhydrase VII
1,675068,10193,CHEMBL1201117,Carbonic anhydrase I
2,1125,10193,CHEMBL20,Carbonic anhydrase I
3,1085,10193,CHEMBL17,Carbonic anhydrase I
4,1124,10193,CHEMBL19,Carbonic anhydrase I
...,...,...,...,...
6258,1407411,112,CHEMBL2135460,Vasopressin V2 receptor
6259,51961,120553,CHEMBL37161,Tubulin beta chain
6260,51961,120554,CHEMBL37161,Tubulin beta chain
6261,442342,22228,CHEMBL272427,Molecular identity unknown


Method to calculate mapping from compound ChEMBL id to targets with which they are known to interact based on the drug_mechanism table.  
Returns target pref_names and ids and bool whether the compound is in the drug_mechanism table. There can be multiple target pref_names / ids per compound.

In [9]:
def in_dm_table(oral_drug_id):
    if oral_drug_id in set(df_dti['parent_molregno']):
        target_names = list(df_dti[df_dti["parent_molregno"] == oral_drug_id]['pref_name'])
        tids = list(df_dti[df_dti["parent_molregno"] == oral_drug_id]['tid'])
        target_chembl_ids = list(df_dti[df_dti["parent_molregno"] == oral_drug_id]['chembl_id'])
        return True, target_names, tids, target_chembl_ids
    else:
        return False, None, None, None

## Methods to Add Target Class Annotations Based on ChEMBL Data

Add information about level 1 and level 2 target class annotations in ChEMBL.

In [10]:
sql = '''
SELECT DISTINCT tc.tid, 
    pc.protein_class_id, pc.pref_name, pc.short_name, pc.protein_class_desc, pc.definition
FROM protein_classification pc
-- join several tables to get the corresponding target id
INNER JOIN component_class cc
    ON pc.protein_class_id = cc.protein_class_id
INNER JOIN component_sequences cs
    ON cc.component_id = cs.component_id
INNER JOIN target_components tc
    ON cs.component_id = tc.component_id
'''

df_target_classes = pd.read_sql_query(sql, con=chembl_con)
df_target_classes

Unnamed: 0,tid,protein_class_id,pref_name,short_name,protein_class_desc,definition
0,1,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond
1,2,1133,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...
2,3,104,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,
3,4,1583,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...
4,5,422,Nicotinic acetylcholine receptor alpha subunit,CHRN alpha,ion channel lgic ach chrn alpha,
...,...,...,...,...,...,...
11700,120595,601,Unclassified protein,Unclassified,unclassified,
11701,120596,601,Unclassified protein,Unclassified,unclassified,
11702,120597,601,Unclassified protein,Unclassified,unclassified,
11703,120598,601,Unclassified protein,Unclassified,unclassified,


Query the protein_classification table for the protein classification hierarchy and merge it with the target class information for specific tids.

In [11]:
sql = '''
WITH RECURSIVE pc_hierarchy AS (
    SELECT protein_class_id,
            parent_id,
            class_level,
            pref_name AS names
    FROM protein_classification
    WHERE parent_id IS NULL

    UNION ALL
   
    SELECT pc.protein_class_id,
        pc.parent_id,
        pc.class_level,
        -- recursively add current protein classification pref_name to string, separated by |
        pc_hierarchy.names || '|' || pc.pref_name 
    FROM protein_classification pc, pc_hierarchy
    WHERE pc.parent_id = pc_hierarchy.protein_class_id
)
SELECT *
FROM pc_hierarchy
'''


target_class_hierarchy = pd.read_sql_query(sql, con=chembl_con)
target_class_hierarchy[['l0', 'l1', 'l2', 'l3', 'l4', 'l5', 'l6']] = target_class_hierarchy['names'].str.split('|', expand=True)
target_class_hierarchy = target_class_hierarchy[target_class_hierarchy['protein_class_id'] != 0][['protein_class_id', 'l1', 'l2']]
df_target_classes = df_target_classes.merge(target_class_hierarchy, on = 'protein_class_id', how = 'left')
df_target_classes

Unnamed: 0,tid,protein_class_id,pref_name,short_name,protein_class_desc,definition,l1,l2
0,1,646,Hydrolase,Hydrolase,enzyme hydrolase,A group of enzymes that catalyze the hydrolysis of a chemical bond,Enzyme,Hydrolase
1,2,1133,ABCC subfamily,MRP,transporter ntpase atp binding cassette mrp,A sequence-related subfamily of ATP-BINDING CASSETTE TRANSPORTERS that actively transport organi...,Transporter,Primary active transporter
2,3,104,Phosphodiesterase 5A,PDE_5A,enzyme phosphodiesterase pde_5 pde_5a,,Enzyme,Phosphodiesterase
3,4,1583,Voltage-gated calcium channel,VG CA,ion channel vgc vg ca,Voltage-dependent cell membrane glycoproteins selectively permeable to calcium ions. They are ca...,Ion channel,Voltage-gated ion channel
4,5,422,Nicotinic acetylcholine receptor alpha subunit,CHRN alpha,ion channel lgic ach chrn alpha,,Ion channel,Ligand-gated ion channel
...,...,...,...,...,...,...,...,...
11700,120595,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11701,120596,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11702,120597,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,
11703,120598,601,Unclassified protein,Unclassified,unclassified,,Unclassified protein,


Summarise the information for a target id with several assigned target classes of level 1 into one description. If a target id has more than one assigned target class, the target class 'Unclassified protein' is discarded.

In [12]:
level = 'l1'
between_str_join = '|'
target_classes_level1 = df_target_classes[['tid', level]].drop_duplicates().dropna()

# remove 'Unclassified protein' from targets with more than one target class, level 1
nof_classes = target_classes_level1.groupby(['tid'])[level].count()
target_classes_level1 = target_classes_level1[
    (target_classes_level1['tid'].isin(nof_classes[nof_classes == 1].index.tolist())) 
    | ((target_classes_level1['tid'].isin(nof_classes[nof_classes > 1].index.tolist())) 
       & (target_classes_level1['l1'] != 'Unclassified protein'))]

target_classes_level1['target_class_l1'] = target_classes_level1.groupby(['tid'])[level].transform(lambda x: between_str_join.join(sorted(x)))
target_classes_level1 = target_classes_level1[['tid', 'target_class_l1']].drop_duplicates()

Repeat the summary step for target classes of level 2.

In [13]:
level = 'l2'
target_classes_level2 = df_target_classes[['tid', level]].drop_duplicates().dropna()
target_classes_level2['target_class_l2'] = target_classes_level2.groupby(['tid'])[level].transform(lambda x: between_str_join.join(sorted(x)))
target_classes_level2 = target_classes_level2[['tid', 'target_class_l2']].drop_duplicates()

## Add ChEMBL Compound Properties and Compound Structures Based on ChEMBL ID

Add compound properties and structures based on the compound_properties table and the compound_structures table. 

In [14]:
sql = '''
SELECT DISTINCT mh.parent_molregno, 
    cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
    cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
    cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
    cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations
FROM compound_properties cp
INNER JOIN molecule_hierarchy mh
    ON cp.molregno = mh.parent_molregno
'''

df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
df_cpd_props = df_cpd_props.astype({'parent_molregno': 'Int64'})
df_cpd_props.head()

Unnamed: 0,parent_molregno,mw_freebase,alogp,hba,hbd,psa,rtb,ro3_pass,num_ro5_violations,cx_most_apka,cx_most_bpka,cx_logp,cx_logd,molecular_species,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,full_molformula,hba_lipinski,hbd_lipinski,num_lipinski_ro5_violations
0,1,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.48,,3.63,2.69,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0
1,2,332.32,1.33,6.0,1.0,108.61,3.0,N,0.0,6.33,,2.88,1.82,ACID,332.32,3.0,25.0,0.73,332.0909,C18H12N4O3,7.0,1.0,0.0
2,3,357.8,2.27,5.0,2.0,87.98,3.0,N,0.0,6.33,,3.7,2.64,ACID,357.8,3.0,25.0,0.75,357.088,C18H16ClN3O3,6.0,2.0,0.0
3,4,307.31,1.46,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.02,1.97,ACID,307.31,3.0,23.0,0.74,307.0957,C17H13N3O3,6.0,1.0,0.0
4,5,341.75,2.11,5.0,1.0,84.82,3.0,N,0.0,6.33,,3.63,2.57,ACID,341.75,3.0,24.0,0.74,341.0567,C17H12ClN3O3,6.0,1.0,0.0


## Methods to Add RDKit-Based Compound Descriptors

## Built-in Compound Descriptors

Add relevant compound descriptors using built-in RDKit methods. 

In [15]:
def add_RDKit_columns(oral_drugs):
    # add a column with RDKit molecules, used to calculate the descriptors
    PandasTools.AddMoleculeColumnToFrame(oral_drugs, smiles_column, 'mol', includeFingerprints=False)

    oral_drugs.loc[:,'fraction_csp3'] = oral_drugs['mol'].apply(Descriptors.FractionCSP3)
    oral_drugs.loc[:,'num_aliphatic_carbocycles'] = oral_drugs['mol'].apply(Descriptors.NumAliphaticCarbocycles)
    oral_drugs.loc[:,'num_aliphatic_heterocycles'] = oral_drugs['mol'].apply(Descriptors.NumAliphaticHeterocycles)
    oral_drugs.loc[:,'num_aliphatic_rings'] = oral_drugs['mol'].apply(Descriptors.NumAliphaticRings)
    oral_drugs.loc[:,'num_aromatic_carbocycles'] = oral_drugs['mol'].apply(Descriptors.NumAromaticCarbocycles)
    oral_drugs.loc[:,'num_aromatic_heterocycles'] = oral_drugs['mol'].apply(Descriptors.NumAromaticHeterocycles)
    oral_drugs.loc[:,'num_aromatic_rings'] = oral_drugs['mol'].apply(Descriptors.NumAromaticRings)
    oral_drugs.loc[:,'num_heteroatoms'] = oral_drugs['mol'].apply(Descriptors.NumHeteroatoms)
    oral_drugs.loc[:,'num_saturated_carbocycles'] = oral_drugs['mol'].apply(Descriptors.NumSaturatedCarbocycles)
    oral_drugs.loc[:,'num_saturated_heterocycles'] = oral_drugs['mol'].apply(Descriptors.NumSaturatedHeterocycles)
    oral_drugs.loc[:,'num_saturated_rings'] = oral_drugs['mol'].apply(Descriptors.NumSaturatedRings)
    oral_drugs.loc[:,'ring_count'] = oral_drugs['mol'].apply(Descriptors.RingCount)
    oral_drugs.loc[:,'num_stereocentres'] = oral_drugs['mol'].apply(Chem.rdMolDescriptors.CalcNumAtomStereoCenters)

    # add scaffolds
    PandasTools.AddMurckoToFrame(oral_drugs, 'mol', 'scaffold_w_stereo')
    # remove stereo information of the molecule to add scaffolds without stereo information
    oral_drugs['mol'].apply(Chem.RemoveStereochemistry)
    PandasTools.AddMurckoToFrame(oral_drugs, 'mol', 'scaffold_wo_stereo')

    # drop the column with RDKit molecules
    return oral_drugs.drop(['mol'] , axis=1)

## Aromaticity Descriptors

Add descriptors for aromaticity, using an RDKit-based method.

In [16]:
def calculate_aromatic_atoms(smiles_set):
    aromatic_atoms_dict = dict()
    aromatic_c_dict = dict()
    aromatic_n_dict = dict()
    aromatic_hetero_dict = dict()
    
    for smiles in tqdm(smiles_set):
        mol = Chem.MolFromSmiles(smiles)
        aromatic_atoms_dict[smiles] = sum(mol.GetAtomWithIdx(i).GetIsAromatic() for i in range(mol.GetNumAtoms()))
        aromatic_c_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() == 6)) for i in range(mol.GetNumAtoms()))
        aromatic_n_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() == 7)) for i in range(mol.GetNumAtoms()))
        aromatic_hetero_dict[smiles] = sum((mol.GetAtomWithIdx(i).GetIsAromatic() & (mol.GetAtomWithIdx(i).GetAtomicNum() != 6) & (mol.GetAtomWithIdx(i).GetAtomicNum() != 1)) for i in range(mol.GetNumAtoms()))
        
    return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict

In [17]:
def add_aromaticity_columns(oral_drugs):
    # use df_combined_w_smiles to exclude null values
    smiles_set = set(oral_drugs[smiles_column])
    aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = calculate_aromatic_atoms(smiles_set)

    oral_drugs['aromatic_atoms'] = oral_drugs[smiles_column].map(aromatic_atoms_dict)
    oral_drugs['aromatic_c'] = oral_drugs[smiles_column].map(aromatic_c_dict)
    oral_drugs['aromatic_n'] = oral_drugs[smiles_column].map(aromatic_n_dict)
    oral_drugs['aromatic_hetero'] = oral_drugs[smiles_column].map(aromatic_hetero_dict)
    return oral_drugs

# Process Dataset

In [18]:
sheet_names = ['All', 'Additions info', 'Post90 with targets annotated']
smiles_columns = ['SMILES', 'Smiles', 'SMILES']
existing_columns = [11, 9, 8]
results = []

Method to calculate all annotations for a sheet in the excel file.

In [19]:
def get_result(sheet_name, smiles_column):
    # read file
    print('read file')
    oral_drugs = pd.read_excel(oral_drugs_path, sheet_name=sheet_name)
    
    # map smiles to canonical smiles
    print('map smiles to canonical smiles')

    smiles_set = set(oral_drugs[oral_drugs[smiles_column].notnull()][smiles_column])
    smiles_dict, smiles_no_stereo_dict, smiles_w_problems = to_canonical(smiles_set)
    print('#Problems:', len(smiles_w_problems))
    oral_drugs['rdkit_canonical_smiles'] = oral_drugs[smiles_column].map(smiles_dict)
    oral_drugs['rdkit_canonical_no_stereo_smiles'] = oral_drugs[smiles_column].map(smiles_no_stereo_dict)

    # map compound to chembl_id and pref_name
    print('map compound to chembl_id and pref_name')
    oral_drugs[["chembl_parent_molregno", 
                "chembl_parent_id",
                "chembl_parent_pref_name", 
                "chembl_salt_smiles",
                "chembl_parent_smiles",
                "found_type"]] = oral_drugs.apply(lambda row: to_chembl_id(row.rdkit_canonical_smiles, 
                                                                           row.rdkit_canonical_no_stereo_smiles,
                                                                           row.Name), 
                                                        axis='columns', result_type='expand')

    # SMILES may map to more than one 
    oral_drugs = oral_drugs.explode(['chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name', 
                "chembl_salt_smiles", "chembl_parent_smiles"])
    oral_drugs = oral_drugs.astype({'chembl_parent_molregno': 'Int64'})
    
    # map to targets the compound is known to interact with based on the drug_mechanism table
    print('map to targets the compound is known to interact with based on the drug_mechanism table')
    oral_drugs[["in_dm_table", 
            "chembl_target_name", 
            "chembl_tid", 
            "chembl_target_id"]] = oral_drugs.apply(lambda row: in_dm_table(row.chembl_parent_molregno), 
                                                      axis='columns', result_type='expand')
    # compound may appear more than once in drug_mechanism table
    oral_drugs = oral_drugs.explode(['chembl_target_name', 'chembl_tid', 'chembl_target_id'])
    
    # add target class annotations based on target ids
    print('add target class annotations based on target ids')
    oral_drugs = oral_drugs.merge(target_classes_level1[['tid', 'target_class_l1']], left_on='chembl_tid', right_on='tid', how = 'left').drop(columns=['tid'])
    oral_drugs = oral_drugs.merge(target_classes_level2, left_on='chembl_tid', right_on='tid', how = 'left').drop(columns=['tid'])
    
    # add compound properties based on mapped ChEMBL ID
    oral_drugs = oral_drugs.merge(df_cpd_props, left_on='chembl_parent_molregno', right_on = 'parent_molregno', how = 'left').drop(columns=['parent_molregno'])
    
    # add RDKit-based properties
    print('add RDKit-based properties')
    oral_drugs = add_RDKit_columns(oral_drugs)
    oral_drugs = add_aromaticity_columns(oral_drugs)
    return oral_drugs

Calculate the results for all three excel sheets.

In [20]:
for sheet_name, smiles_column in zip(sheet_names, smiles_columns):
    oral_drugs = pd.read_excel(oral_drugs_path, sheet_name=sheet_name)
    print(sheet_name)
    print('{:10} {:4}'.format('#Rows:', len(oral_drugs)))
    print('{:10} {:4}'.format('#SMILES:', len(set(oral_drugs[smiles_column]))))
    print('{:10} {:4}'.format('#Rows w/o SMILES:', len(oral_drugs[oral_drugs[smiles_column].isnull()])))
    results.append(get_result(sheet_name, smiles_column))
    print('----------------')

All
#Rows:     2059
#SMILES:   2057
#Rows w/o SMILES:    0
read file
map smiles to canonical smiles


100%|█████████████████████████████████████| 2057/2057 [00:00<00:00, 7333.41it/s]


#Problems: 0
map compound to chembl_id and pref_name
NC[C@@H]1O[C@H](O[C@@H]2[C@@H](CO)O[C@@H](OC3[C@@H](O)[C@H](N)C[C@H](N)[C@H]3O[C@H]3O[C@H](CN)[C@@H](O)[C@H](O)[C@H]3N)[C@@H]2O)[C@H](N)[C@@H](O)[C@@H]1O [2133, 2059669] ['CHEMBL266347', 'CHEMBL3754093'] [None, None]
CN(C)CCC=C1c2ccccc2CSc2ccccc21 [176206, 916384] ['CHEMBL108947', 'CHEMBL1492500'] ['DOTHIEPIN', 'DOTHIEPIN']
CCOC(O)=Nc1c[n+](N2CCOCC2)no1 [705339, 753339] ['CHEMBL1256353', 'CHEMBL1329455'] ['MOLSIDOMINE', 'MOLSIDOMINE']
CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]2O[C@H](C)C[C@H](N(C)C)[C@H]2O)[C@](C)(O)C[C@@H](C)[C@@H]2N[C@@H](COCCOC)O[C@H]([C@H]2C)[C@]1(C)O [699423, 1592294] ['CHEMBL1237072', 'CHEMBL3039471'] ['DIRITHROMYCIN', 'DIRITHROMYCIN']
Cc1oc(=O)oc1COC(=O)[C@@H]1N2C(=O)[C@@H](NC(=O)[C@H](N)c3ccccc3)[C@H]2SC1(C)C [1378599, 1946697] ['CHEMBL2106329', 'CHEMBL3580454'] ['LENAMPICILLIN', 'LENAMPICILLIN']
C=CC[C@@H]1/C=C(\C)C[C@H](C)C[C@H](OC)[C@H]2O[C@@](O)(C(=O)C(=O)N

100%|█████████████████████████████████████| 2057/2057 [00:00<00:00, 5931.15it/s]


----------------
Additions info
#Rows:       25
#SMILES:     25
#Rows w/o SMILES:    0
read file
map smiles to canonical smiles


100%|█████████████████████████████████████████| 25/25 [00:00<00:00, 5149.17it/s]

#Problems: 0
map compound to chembl_id and pref_name





map to targets the compound is known to interact with based on the drug_mechanism table
add target class annotations based on target ids
add RDKit-based properties


100%|█████████████████████████████████████████| 25/25 [00:00<00:00, 4240.10it/s]


----------------
Post90 with targets annotated
#Rows:      437
#SMILES:    437
#Rows w/o SMILES:    0
read file
map smiles to canonical smiles


100%|███████████████████████████████████████| 437/437 [00:00<00:00, 5563.26it/s]


#Problems: 0
map compound to chembl_id and pref_name
map to targets the compound is known to interact with based on the drug_mechanism table
add target class annotations based on target ids
add RDKit-based properties


100%|███████████████████████████████████████| 437/437 [00:00<00:00, 4480.63it/s]

----------------





## Postprocessing (Types, Rounding)

In [21]:
for i in range(len(results)):
    results[i] = results[i].where(pd.notnull(results[i]), None)
    results[i] = results[i].astype({
        'in_dm_table': 'bool', 
        'chembl_tid': 'Int64', 
        'num_aliphatic_carbocycles': 'Int64',
        'num_aliphatic_heterocycles': 'Int64',
        'num_aliphatic_rings': 'Int64',
        'num_aromatic_carbocycles': 'Int64',
        'num_aromatic_heterocycles': 'Int64',
        'num_aromatic_rings': 'Int64',
        'num_heteroatoms': 'Int64',
        'num_saturated_carbocycles': 'Int64',
        'num_saturated_heterocycles': 'Int64',
        'num_saturated_rings': 'Int64',
        'ring_count': 'Int64',
        'num_stereocentres': 'Int64',
        'aromatic_atoms': 'Int64',
        'aromatic_c': 'Int64',
        'aromatic_n': 'Int64',
        'aromatic_hetero': 'Int64',
        'aromatic_hetero': 'Int64',
        'hba': 'Int64',
        'hbd': 'Int64',
        'rtb': 'Int64',
        'num_ro5_violations': 'Int64',
        'aromatic_rings': 'Int64',
        'heavy_atoms': 'Int64',
        'hba_lipinski': 'Int64',
        'hbd_lipinski': 'Int64',
        'num_lipinski_ro5_violations': 'Int64'
    })

In [22]:
# Round float columns to 4 decimal places
# for result, existing_cols in zip(results, existing_columns):
for i in range(len(results)):
    existing_cols = existing_columns[i]
    decimal_places = 4
    for col_id, (col, dtype) in enumerate(results[i].dtypes.to_dict().items()):
        if col_id > existing_cols:
            if ((dtype == 'float64') or (dtype == 'Float64')):
                results[i][col] = results[i][col].round(decimals=decimal_places)

## Sanity Checks

Check that columns have expected types (by hand).

In [23]:
for result, existing_cols in zip(results, existing_columns):
    print("{:3} {:50} {:10} {}".format("", "column", "type", "#null values"))
    for i, (col, dtype) in enumerate(result.dtypes.to_dict().items()):
        if i > existing_cols:
            print("{:3} {:50} {:10} {}".format(i, col, str(dtype), len(result[result[col].isnull()])))
    print()

    column                                             type       #null values
 12 rdkit_canonical_smiles                             object     0
 13 rdkit_canonical_no_stereo_smiles                   object     0
 14 chembl_parent_molregno                             Int64      20
 15 chembl_parent_id                                   object     20
 16 chembl_parent_pref_name                            object     87
 17 chembl_salt_smiles                                 object     20
 18 chembl_parent_smiles                               object     20
 19 found_type                                         object     20
 20 in_dm_table                                        bool       0
 21 chembl_target_name                                 object     786
 22 chembl_tid                                         Int64      786
 23 chembl_target_id                                   object     786
 24 target_class_l1                                    object     848
 25 target_class_l2    

Check if there are mixed types in columns with dtype=object.

In [24]:
# check that there are no mixed types in object columns
for result, existing_cols, sheet_name in zip(results, existing_columns, sheet_names):
    print(sheet_name)
    issue_ctr = 0
    for i, (col, dtype) in enumerate(result.dtypes.to_dict().items()):
        if i > existing_cols:
            if dtype == object:
                a = set(result[col])
                b = set(result[col].astype(str))
                x = a-b
                y = b-a
                # is there a difference in the two sets
                if(len(x) > 0 or len(y) > 0):
                    if not ((len(x.difference({None})) == 0 and len(y.difference({'None'})) == 0)) and \
                        not ((len(x.difference({np.nan})) == 0 and len(y.difference({'nan'})) == 0)):
                        print("Mixed types in column ", col)
                        print(a-b, '/', b-a)
                        issue_ctr += 1

    print("#Problems:", issue_ctr, '\n')

All
#Problems: 0 

Additions info
#Problems: 0 

Post90 with targets annotated
#Problems: 0 



Check if any columns contain nan or null which aren't recognised as null values. 

In [25]:
# Do any columns have potential issues with null types?
for result, existing_cols, sheet_name in zip(results, existing_columns, sheet_names):
    print(sheet_name)
    issue_ctr = 0
    for i, (col, dtype) in enumerate(result.dtypes.to_dict().items()):
        if 'nan' in set(result[result[col].notnull()][col].astype(str)):
            print("Issue with nan in column", col)
            issue_ctr += 1
        if 'null' in set(result[result[col].notnull()][col].astype(str)):
            print("Issue with null in column", col)
            issue_ctr += 1

    print("#Problems:", issue_ctr, '\n')

All
#Problems: 0 

Additions info
#Problems: 0 

Post90 with targets annotated
#Problems: 0 



## Writing Results

Write full dataset to file.

In [26]:
# Write results to new sheet
dataset_all_name = path_results+'Oral_drugs_chembl.xlsx'
with pd.ExcelWriter(dataset_all_name,engine='xlsxwriter') as writer: 
    for result, sheet_name, num_cols in zip(results, sheet_names, existing_columns):
        columns = list(result.columns[:num_cols+1]) + \
            ['rdkit_canonical_smiles', 
             'chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name',
             'chembl_salt_smiles', 'chembl_parent_smiles',
             'in_dm_table', 'chembl_target_name', 'chembl_tid', 'chembl_target_id',
             'target_class_l1', 'target_class_l2', 
             'mw_freebase', 'alogp', 'hba',
             'hbd', 'psa', 'rtb', 'ro3_pass', 'num_ro5_violations', 'cx_most_apka',
             'cx_most_bpka', 'cx_logp', 'cx_logd', 'molecular_species', 'full_mwt',
             'aromatic_rings', 'heavy_atoms', 'qed_weighted', 'mw_monoisotopic',
             'full_molformula', 'hba_lipinski', 'hbd_lipinski',
             'num_lipinski_ro5_violations',
             'fraction_csp3',
             'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings', 
             'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings', 
             'num_heteroatoms', 'num_saturated_carbocycles', 'num_saturated_heterocycles',
             'num_saturated_rings', 'ring_count', 'num_stereocentres',
             'scaffold_w_stereo', 'scaffold_wo_stereo', 
             'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']
        result[columns].to_excel(writer, sheet_name=sheet_name, index = False)

Write dataset restricted to RDKit-based properties to file.

In [27]:
# Write restricted results to new sheet
dataset_restricted_name = path_results+'Oral_drugs_chembl_restricted.xlsx'
with pd.ExcelWriter(dataset_restricted_name) as writer: 
    for result, sheet_name, num_cols in zip(results, sheet_names, existing_columns):
        columns = list(result.columns[:num_cols+1]) + \
            ['fraction_csp3',
             'num_aliphatic_carbocycles', 'num_aliphatic_heterocycles', 'num_aliphatic_rings', 
             'num_aromatic_carbocycles', 'num_aromatic_heterocycles', 'num_aromatic_rings', 
             'num_heteroatoms', 'num_saturated_carbocycles', 'num_saturated_heterocycles',
             'num_saturated_rings', 'ring_count', 'num_stereocentres',
             'scaffold_w_stereo', 'scaffold_wo_stereo', 
             'aromatic_atoms', 'aromatic_c', 'aromatic_n', 'aromatic_hetero']
        result[columns].drop_duplicates().to_excel(writer, sheet_name=sheet_name, index = False)

# Write Instances With Difference Between Results and Chembl

Write dataset restricted to mapped canonical SMILES.

Combine relevant information of the three sheets into one table.

In [28]:
res0 = results[0][['Name', 'SMILES', 'Publication Date', 'Publication decade', 'Decade', 'Formula',
                   'rdkit_canonical_smiles', 'chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name']]
res1 = results[1][['Name', 'Target', 'Target class', 'Ref', 'Patent', 'Smiles', 
                   'rdkit_canonical_smiles', 'chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name'
                  ]].rename(columns={'Smiles': 'SMILES', 'Target': 'Target (Paul)', 'Target class': 'Target class (Paul)'})
res2 = results[2][['Name', 'SMILES', 'Publication Date', 'Decade',
                   'Target (to be checked)', 'Target class (to be checked)', 
                   'rdkit_canonical_smiles', 'chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name'
                  ]].rename(columns={'Target (to be checked)': 'Target (Paul)', 'Target class (to be checked)': 'Target class (Paul)'})

df_combined = pd.concat([res0, res1, res2], sort=False)
df_combined = df_combined[['Name', 'SMILES', 
                           'Publication Date', 'Publication decade', 'Decade',
                           'Formula', 'Target (Paul)', 'Target class (Paul)', 'Ref', 'Patent', 
                           'rdkit_canonical_smiles', 'chembl_parent_molregno',
                           'chembl_parent_id', 'chembl_parent_pref_name'
                          ]].drop_duplicates().sort_values(by=['SMILES']).reset_index(drop=True)
df_combined = df_combined.where(pd.notnull(df_combined), None)
df_combined

Unnamed: 0,Name,SMILES,Publication Date,Publication decade,Decade,Formula,Target (Paul),Target class (Paul),Ref,Patent,rdkit_canonical_smiles,chembl_parent_molregno,chembl_parent_id,chembl_parent_pref_name
0,Lonafarnib,BrC1=CC2=C(N=C1)[C@H](C3CCN(C(CC4CCN(C(N)=O)CC4)=O)CC3)C5=C(C=C(Cl)C=C5Br)CC2,1997.0,1990s,1990s,C27H31Br2ClN4O2,,,,,NC(=O)N1CCC(CC(=O)N2CCC([C@H]3c4ncc(Br)cc4CCc4cc(Cl)cc(Br)c43)CC2)CC1,84655,CHEMBL298734,LONAFARNIB
1,Lonafarnib,BrC1=CC2=C(N=C1)[C@H](C3CCN(C(CC4CCN(C(N)=O)CC4)=O)CC3)C5=C(C=C(Cl)C=C5Br)CC2,1997.0,,1990s,,Farnesyl transferase,Other,,,NC(=O)N1CCC(CC(=O)N2CCC([C@H]3c4ncc(Br)cc4CCc4cc(Cl)cc(Br)c43)CC2)CC1,84655,CHEMBL298734,LONAFARNIB
2,Pipobroman,BrCCC(=O)N1CCN(CC1)C(=O)CCBr,1960.0,1960s,1960s,C10H16Br2N2O2,,,,,O=C(CCBr)N1CCN(C(=O)CCBr)CC1,453722,CHEMBL1585,PIPOBROMAN
3,Bromazepam,Brc1ccc2NC(=O)CN=C(c3ccccn3)c2c1,1962.0,1960s,1960s,C14H10BrN3O,,,,,O=C1CN=C(c2ccccn2)c2cc(Br)ccc2N1,16960,CHEMBL277062,BROMAZEPAM
4,Norgestrel,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3CC[C@@]21CC,1963.0,1960s,1960s,C21H28O2,,,,,C#CC1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3CC[C@@]21CC,1380067,CHEMBL2107797,NORGESTREL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522,Levamisole,c1ccc([C@H]2CN3CCSC3=N2)cc1,1967.0,1960s,1960s,C11H12N2S,,,,,c1ccc([C@H]2CN3CCSC3=N2)cc1,384948,CHEMBL1454,LEVAMISOLE
2523,BENZHEXOL,c1ccc(cc1)C(CCN2CCCCC2)(C3CCCCC3)O,1949.0,1950 -,1940s,C20H31NO,,,,,OC(CCN1CCCCC1)(c1ccccc1)C1CCCCC1,418411,CHEMBL1490,TRIHEXYPHENIDYL
2524,Gosogliptin,c1cnc(nc1)N2CCN(CC2)[C@H]3C[C@H](NC3)C(=O)N4CCC(C4)(F)F,2005.0,,2000s,,DPP-4,Protease,,,O=C([C@@H]1C[C@H](N2CCN(c3ncccn3)CC2)CN1)N1CCC(F)(F)C1,460014,CHEMBL515387,GOSOGLIPTIN
2525,Gosogliptin,c1cnc(nc1)N2CCN(CC2)[C@H]3C[C@H](NC3)C(=O)N4CCC(C4)(F)F,2005.0,2000s,2000s,C17H24F2N6O,,,,,O=C([C@@H]1C[C@H](N2CCN(c3ncccn3)CC2)CN1)N1CCC(F)(F)C1,460014,CHEMBL515387,GOSOGLIPTIN


Identify SMILES which have contradictory information (including being mapped to more than one ChEMBL ID) content in the different sheets. These row won't be combined into a single row in the next steps.

In [29]:
problem_smiles = set()
more_than_one_chembl_id = set()

for smiles in set(df_combined['SMILES']):
    if len(df_combined[df_combined['SMILES'] == smiles]) > 1:
        display_smiles = True
        for col in ['Name', 'Publication Date', 'Publication decade', 'Decade', 'Formula', 
                    'Target (Paul)', 'Target class (Paul)', 'Ref', 'Patent',
                    'rdkit_canonical_smiles', 'chembl_parent_molregno', 'chembl_parent_id', 'chembl_parent_pref_name']:
            # more than one not-null value for a column
            if len({x for x in set(df_combined[df_combined['SMILES'] == smiles][col]) if pd.notna(x)}) > 1:
                problem_smiles.add(smiles)
                if col == 'chembl_parent_molregno':
                    more_than_one_chembl_id.add(smiles)
                
                # output if there is a mismatch other than smiles mapping to more than one ChEMBL ID
                if col != 'chembl_parent_molregno' and col != 'chembl_parent_id' and col != 'chembl_parent_pref_name':
                    if display_smiles:
                        print(smiles)
                        print("#rows:", len(df_combined[df_combined['SMILES'] == smiles]))
                    print("{:25} {}".format(col, 
                                               set({x for x in set(df_combined[df_combined['SMILES'] == smiles][col]) if pd.notna(x)})))
                    display_smiles = False
                
        if not display_smiles:
            print()

C[C@H](C(=O)O)c1cccc(C(=O)c2ccccc2)c1
#rows: 2
Name                      {'Dexketoprofen', 'Ketoprofen'}

CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1
#rows: 2
Name                      {'Levosalbutamol', 'Albuterol'}
Publication Date          {1962.0, 1971.0}
Publication decade        {'1970s', '1960s'}
Decade                    {'1970s', '1960s'}



Combine rows originating from different tables which have no contradictory information.

In [30]:
df_combined_agg = df_combined[~df_combined['SMILES'].isin(problem_smiles)].groupby('SMILES', as_index=False).first()
df_combined_problem = df_combined[df_combined['SMILES'].isin(problem_smiles)]
df_combined = pd.concat([df_combined_agg, df_combined_problem], sort=False).sort_values(by=['Publication Date']).reset_index(drop=True)
df_combined

Unnamed: 0,SMILES,Name,Publication Date,Publication decade,Decade,Formula,Target (Paul),Target class (Paul),Ref,Patent,rdkit_canonical_smiles,chembl_parent_molregno,chembl_parent_id,chembl_parent_pref_name
0,O[C@H](C(O)=O)[C@@H](C(O)=O)O,Tartaric acid,1837.0,1950 -,1930 -,C4H6O6,,,,,O=C(O)[C@@H](O)[C@H](O)C(=O)O,674812,CHEMBL1200861,TARTARIC ACID
1,O=[N+]([O-])OCC(CO[N+](=O)[O-])O[N+](=O)[O-],Nitroglycerin,1855.0,1950 -,1930 -,C3H5N3O9,,,,,O=[N+]([O-])OCC(CO[N+](=O)[O-])O[N+](=O)[O-],37493,CHEMBL730,NITROGLYCERIN
2,OC(O)C(Cl)(Cl)Cl,Chloral,1869.0,1950 -,1930 -,C2H3Cl3O2,,,,,OC(O)C(Cl)(Cl)Cl,453652,CHEMBL455917,CHLORAL HYDRATE
3,CNC(Oc1cc([C@@]2(CCN([C@@H]2N3C)C)C)c3cc1)=O,Physostigmine,1872.0,1950 -,1930 -,C15H21N3O2,,,,,CNC(=O)Oc1ccc2c(c1)[C@]1(C)CCN(C)[C@@H]1N2C,9277,CHEMBL94,PHYSOSTIGMINE
4,N[C@@H](CCC(=O)O)C(=O)O,Glutamic acid,1872.0,1950 -,1930 -,C5H9NO4,,,,,N[C@@H](CCC(=O)O)C(=O)O,573408,CHEMBL575060,GLUTAMIC ACID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2061,CC(C)(C#Cc1ccc(-c2ccc(Cl)c3c(NS(C)(=O)=O)nn(CC(F)(F)F)c23)c([C@H](Cc2cc(F)cc(F)c2)NC(=O)Cn2nc(C(...,Lenacapavir,2018.0,2010s,2010s,C39H32ClF10N7O5S2,HIV capsid,Other,,,CC(C)(C#Cc1ccc(-c2ccc(Cl)c3c(NS(C)(=O)=O)nn(CC(F)(F)F)c23)c([C@H](Cc2cc(F)cc(F)c2)NC(=O)Cn2nc(C(...,2464869,CHEMBL4594438,LENACAPAVIR
2062,CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=CC=C5)Cl)C(=N2)N6CCN([C@H](C6)CC#N)C(=O)C(=C)F,Adagrasib,2019.0,2010s,2010s,C32H35ClFN7O2,KRAS G12C,Other,FDA 2022,2019.0,C=C(F)C(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N,2464781,CHEMBL4594350,ADAGRASIB
2063,CC(C(OC[C@@H]1[C@@H](O)[C@@H](O)[C@H](N2C=CC(NO)=NC2=O)O1)=O)C,Molnupiravir,2019.0,2010s,2010s,C13H19N3O7,RNA polymerase,Other,,,CC(C)C(=O)OC[C@H]1O[C@@H](n2ccc(NO)nc2=O)[C@H](O)[C@@H]1O,2486580,CHEMBL4650320,MOLNUPIRAVIR
2064,CC1(C)[C@@H]2[C@H]1[C@@H](C(N[C@H](C#N)C[C@@H]3CCNC3=O)=O)N(C([C@@H](NC(C(F)(F)F)=O)C(C)(C)C)=O)C2,Nirmatrelvir,2021.0,2010s,2010s,C23H32F3N5O4,SARS-CoV-2-3CL protease,Protease,,,CC(C)(C)[C@H](NC(=O)C(F)(F)F)C(=O)N1C[C@H]2[C@@H]([C@H]1C(=O)N[C@H](C#N)C[C@@H]1CCNC1=O)C2(C)C,2537386,CHEMBL4802135,NIRMATRELVIR


Write compounds without a mapping to ChEMBL (based on canonical SMILES) to file. 

In [31]:
missing_cpds = path_results+'Missing_oral_drugs.xlsx'
missing_in_chembl = df_combined[df_combined['chembl_parent_molregno'].isnull()][[
    'Name', 'SMILES', 'Publication Date', 'Publication decade', 'Decade',
    'Formula', 'Target (Paul)', 'Target class (Paul)', 'Ref', 'Patent',
    'rdkit_canonical_smiles']]
print("Smiles with contradictory information: ", len(missing_in_chembl[missing_in_chembl['SMILES'].isin(problem_smiles)]))
missing_in_chembl.to_excel(missing_cpds, index = False)

Smiles with contradictory information:  0


Write compounds with multiple ChEMBL IDs per SMILES to file.

In [32]:
more_than_one_ID = path_results+'Multiple_mappings_oral_drugs.xlsx'
multiple_mapped = df_combined[df_combined['SMILES'].isin(more_than_one_chembl_id)]
multiple_mapped.to_excel(more_than_one_ID, index = False)

Sanity check: check that the compounds with multiple ChEMBL ID mappings are really parent_molregnos.

In [33]:
sql = '''
SELECT DISTINCT mh.molregno, mh.parent_molregno
FROM molecule_hierarchy mh
'''

df_mol_hierarchy = pd.read_sql_query(sql, con=chembl_con)

for smiles in more_than_one_chembl_id:
    for chembl_molregno in set(df_combined[df_combined['SMILES'] == smiles]['chembl_parent_molregno']):
        parent_molregno = (df_mol_hierarchy[df_mol_hierarchy['molregno'] == chembl_molregno]['parent_molregno']).item()
        print("{:10} and {:10} are identical: {}".format(chembl_molregno, parent_molregno, chembl_molregno == parent_molregno))

       924 and        924 are identical: True
   2435351 and    2435351 are identical: True
   1946697 and    1946697 are identical: True
   1378599 and    1378599 are identical: True
   1592294 and    1592294 are identical: True
    699423 and     699423 are identical: True
      2133 and       2133 are identical: True
   2059669 and    2059669 are identical: True
    705339 and     705339 are identical: True
    753339 and     753339 are identical: True
    916384 and     916384 are identical: True
    176206 and     176206 are identical: True


# Sanity Checks

Check that output files can be written and read and are identical to the results.

In [34]:
for read_filename in [dataset_all_name, dataset_restricted_name]:
    print(read_filename)
    for result, sheet_name in zip(results, sheet_names):
        result_copy = result.copy().drop(columns=['found_type', 'rdkit_canonical_no_stereo_smiles'])
        result_copy = result_copy.replace('', None).reset_index(drop=True)

        read_file = pd.read_excel(read_filename, sheet_name=sheet_name)
        read_file = read_file.where(pd.notnull(read_file), None)
        read_file = read_file.replace('', None).reset_index(drop=True)

        if read_filename == dataset_all_name:
            read_file = read_file.astype({
                'chembl_parent_molregno': 'Int64', 
                'in_dm_table': 'bool', 
                'chembl_tid': 'Int64', 
                'hba': 'Int64',
                'hbd': 'Int64',
                'rtb': 'Int64',
                'num_ro5_violations': 'Int64',
                'aromatic_rings': 'Int64',
                'heavy_atoms': 'Int64',
                'hba_lipinski': 'Int64',
                'hbd_lipinski': 'Int64',
                'num_lipinski_ro5_violations': 'Int64'
            })
        else: #read_filename == dataset_restricted_name
            result_copy = result_copy[read_file.columns].drop_duplicates().reset_index(drop=True)
        
        read_file = read_file.astype({
                'num_aliphatic_carbocycles': 'Int64',
                'num_aliphatic_heterocycles': 'Int64',
                'num_aliphatic_rings': 'Int64',
                'num_aromatic_carbocycles': 'Int64',
                'num_aromatic_heterocycles': 'Int64',
                'num_aromatic_rings': 'Int64',
                'num_heteroatoms': 'Int64',
                'num_saturated_carbocycles': 'Int64',
                'num_saturated_heterocycles': 'Int64',
                'num_saturated_rings': 'Int64',
                'ring_count': 'Int64',
                'num_stereocentres': 'Int64',
                'aromatic_atoms': 'Int64',
                'aromatic_c': 'Int64',
                'aromatic_n': 'Int64',
                'aromatic_hetero': 'Int64'
        })
        
        print("{:40} file is ok: {}".format(sheet_name, read_file.equals(result_copy)))
    print("----------")

/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/Oral_drugs_chembl.xlsx
All                                      file is ok: True
Additions info                           file is ok: True
Post90 with targets annotated            file is ok: True
----------
/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/Oral_drugs_chembl_restricted.xlsx
All                                      file is ok: True
Additions info                           file is ok: True
Post90 with targets annotated            file is ok: True
----------


Check that output files can be written and read and are identical to reading the original dataset.

In [35]:
for read_filename in [dataset_all_name, dataset_restricted_name]:
    print(read_filename)
    for result, sheet_name in zip(results, sheet_names):
        original_file = pd.read_excel(oral_drugs_path, sheet_name=sheet_name)

        read_file = pd.read_excel(read_filename, sheet_name=sheet_name)
        read_file = read_file[original_file.columns].drop_duplicates().reset_index(drop=True)

        print("{:40} file is ok: {}".format(sheet_name, read_file.equals(original_file)))
    print("----------")

/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/Oral_drugs_chembl.xlsx
All                                      file is ok: True
Additions info                           file is ok: True
Post90 with targets annotated            file is ok: True
----------
/Users/heinzke/Documents/PhD/Projects/drug_target_dataset_curation/results/Oral_drugs_chembl_restricted.xlsx
All                                      file is ok: True
Additions info                           file is ok: True
Post90 with targets annotated            file is ok: True
----------
