<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1">Load Data</a></span></li><li><span><a href="#Process-Data" data-toc-modified-id="Process-Data-2">Process Data</a></span><ul class="toc-item"><li><span><a href="#Convert-SMILES-Data" data-toc-modified-id="Convert-SMILES-Data-2.1">Convert SMILES Data</a></span></li><li><span><a href="#Calulate-Lepinski-Descriptors" data-toc-modified-id="Calulate-Lepinski-Descriptors-2.2">Calulate Lepinski Descriptors</a></span></li><li><span><a href="#Calculate-Additional-Molecular-Descriptors" data-toc-modified-id="Calculate-Additional-Molecular-Descriptors-2.3">Calculate Additional Molecular Descriptors</a></span></li></ul></li><li><span><a href="#Visualize-Data" data-toc-modified-id="Visualize-Data-3">Visualize Data</a></span></li></ul></div>

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
plt.style.use('ggplot')

# Load Data 

In [4]:
# Read the hit-score export into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('Data/hitdata.csv')
# Preview the first 10 rows.
data.head(10)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID
0,CHEMBL231779,APIXABAN,4,459.51,0,2.7,Apixaban,COc1ccc(-n2nc(C(N)=O)c3c2C(=O)N(c2ccc(N4CCCCC4...,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
1,CHEMBL186,CEFEPIME,4,480.57,0,-1.28,cefepime,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N...,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
2,CHEMBL240597,CHENODIOL,4,392.58,0,4.48,chenodeoxycholic-acid,C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C...,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
3,CHEMBL1098,BUPIVACAINE,4,288.44,0,3.9,bupivacaine,CCCCN1CCCCC1C(=O)Nc1c(C)cccc1C,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
4,CHEMBL345714,ACIPIMOX,4,154.12,0,-0.28,Acipimox,Cc1cnc(C(=O)O)c[n+]1[O-],Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
5,CHEMBL498,CHLORPROPAMIDE,4,276.75,0,1.74,Chlorpropamide,CCCNC(=O)NS(=O)(=O)c1ccc(Cl)cc1,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
6,CHEMBL475534,NITRENDIPINE,4,360.37,0,2.57,nitrendipine,CCOC(=O)C1=C(C)NC(C)=C(C(=O)OC)C1c1cccc([N+](=...,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
7,CHEMBL1625,OXYBENZONE,4,228.25,0,2.63,Oxybenzone,COc1ccc(C(=O)c2ccccc2)c(O)c1,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
8,CHEMBL2359059,,0,403.95,0,3.99,Propiverine hydrochloride,CCCOC(C(=O)OC1CCN(C)CC1)(c1ccccc1)c1ccccc1.Cl,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839
9,CHEMBL421,SULFASALAZINE,4,398.4,0,3.7,Sulfasalazine,O=C(O)c1cc(/N=N/c2ccc(S(=O)(=O)Nc3ccccn3)cc2)c...,Hit score,'=',...,CHEMBL4303835,SARS-CoV-2,Severe acute respiratory syndrome coronavirus 2,ORGANISM,CHEMBL4303122,52,SARS-CoV-2 Screening Data,,2020,CHEMBL4303839


# Process Data

In [6]:
# Assay, target, document, source and ligand-efficiency columns removed before
# processing. Each label appears exactly once (the previous list repeated
# several of them, e.g. 'Document Year' three times).
columns_to_drop = [
    'Document Year', 'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
    'Document ChEMBL ID', 'Source ID', 'Source Description',
    'Document Journal', 'Cell ChEMBL ID',
    'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
    'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
    'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
    'pChEMBL Value', 'Data Validity Comment', 'Comment',
    'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
    'Ligand Efficiency LLE', 'Ligand Efficiency SEI',
    'Potential Duplicate',
]

# `data` is rebound by this cell, so on a second execution the columns are
# already gone; errors='ignore' keeps the cell re-runnable instead of raising
# KeyError. Trade-off: a misspelled label is silently skipped as well.
data = data.drop(columns=columns_to_drop, errors='ignore')

In [7]:
# Preview the first rows of `data` after the column drop.
data.head()

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL231779,APIXABAN,4,459.51,0,2.7,Apixaban,COc1ccc(-n2nc(C(N)=O)c3c2C(=O)N(c2ccc(N4CCCCC4...,Hit score,'=',0.2748,
1,CHEMBL186,CEFEPIME,4,480.57,0,-1.28,cefepime,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N...,Hit score,'=',0.1073,
2,CHEMBL240597,CHENODIOL,4,392.58,0,4.48,chenodeoxycholic-acid,C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C...,Hit score,'=',0.2031,
3,CHEMBL1098,BUPIVACAINE,4,288.44,0,3.9,bupivacaine,CCCCN1CCCCC1C(=O)Nc1c(C)cccc1C,Hit score,'=',0.1503,
4,CHEMBL345714,ACIPIMOX,4,154.12,0,-0.28,Acipimox,Cc1cnc(C(=O)O)c[n+]1[O-],Hit score,'=',0.138,


## Convert SMILES Data

In [75]:
# Parse each SMILES string into an RDKit molecule object.
molecules = []
# Row labels of `data` whose SMILES was missing or unparsable, kept so that
# `molecules` can be re-aligned with `data` later.
skipped_smiles_rows = []
for row_label, smile in data['Smiles'].items():
    # Non-string entries (presumably NaN for rows without a SMILES value)
    # cannot be passed to MolFromSmiles, so treat them as unparsable.
    mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
    if mol is None:
        # MolFromSmiles reports a string it cannot parse by returning None
        # rather than raising; keeping None would break descriptor functions.
        skipped_smiles_rows.append(row_label)
        continue
    molecules.append(mol)

## Calculate Lipinski Descriptors

In [90]:
def lipinski(molecules, verbose=False):
    """
    Calculates the Lipinski rule of 5 descriptors for a set of molecules

    Parameters
    ----------
    molecules : array_like
        array of rdkit molecule objects
    verbose : bool, optional
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    DataFrame
        One row per molecule, in input order, with the columns
        ``MW``, ``LogP``, ``NumHDonors`` and ``NumHAcceptors``.
        An empty input gives an empty frame that still has these columns.
    """
    # Column name -> RDKit function that computes the value for one molecule.
    descriptors = {
        "MW": Descriptors.MolWt,
        "LogP": Descriptors.MolLogP,
        "NumHDonors": Lipinski.NumHDonors,
        "NumHAcceptors": Lipinski.NumHAcceptors,
    }
    rows = [
        {name: compute(molecule) for name, compute in descriptors.items()}
        for molecule in molecules
    ]
    # Passing the columns explicitly keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=list(descriptors))

In [91]:
# Display the Lipinski descriptors for the parsed molecules.
# NOTE(review): the result is built from the `molecules` list and gets a fresh
# 0..n-1 index, so its rows only line up with `data` if no SMILES was skipped
# during conversion - confirm before joining the two.
lipinski(molecules)

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,459.506,2.69960,1,6
1,480.572,-1.27990,2,10
2,392.580,4.47790,3,3
3,288.435,3.89654,1,2
4,154.125,-0.27838,1,3
...,...,...,...,...
1654,357.793,3.92732,1,4
1655,191.139,-1.27820,1,6
1656,183.207,0.35060,4,4
1657,455.551,4.40250,1,7


## Calculate Additional Molecular Descriptors

In [112]:
def get_SAmapping(molecule) -> float:
    """
    Wrapper function returning the polar surface area of a molecule, read from
    the ``PSA`` field of RDKit's QED properties.

    Parameters
    ----------
    molecule: rdkit molecule object

    Returns
    -------
    float
        Value of ``Chem.QED.properties(molecule).PSA``.
    """
    return Chem.QED.properties(molecule).PSA
    
    
def molecular_properties(molecules, verbose = False):
    """
    Calculates the following attributes of a set of molecules
    (names match the columns of the returned frame)
    rotatable_bonds
    number_of_atoms
    molar_refractivity
    SA_mapping (polar surface area from ``get_SAmapping``)
    formal_charge
    heavy_atoms
    num_of_rings

    Parameters
    ----------
    molecules : array_like
        array of rdkit molecule objects
    verbose : bool, optional
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    DataFrame
        One row per molecule, in input order, one column per attribute above.
        An empty input gives an empty frame that still has these columns.
    """
    # Column name -> callable that computes the value for one molecule.
    properties = {
        'rotatable_bonds': Descriptors.NumRotatableBonds,
        'number_of_atoms': Chem.rdchem.Mol.GetNumAtoms,
        'molar_refractivity': Chem.Crippen.MolMR,
        "SA_mapping": get_SAmapping,
        'formal_charge': Chem.rdmolops.GetFormalCharge,
        'heavy_atoms': Chem.rdchem.Mol.GetNumHeavyAtoms,
        'num_of_rings': Chem.rdMolDescriptors.CalcNumRings,
    }
    rows = [
        {name: compute(molecule) for name, compute in properties.items()}
        for molecule in molecules
    ]
    # Passing the columns explicitly keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=list(properties))

In [114]:
# Display the additional descriptors for the parsed molecules; like the
# Lipinski table, rows follow the order of `molecules`, not the index of `data`.
molecular_properties(molecules)

Unnamed: 0,rotatable_bonds,number_of_atoms,molar_refractivity,SA_mapping,formal_charge,heavy_atoms,num_of_rings
0,5,34,126.6604,110.76,0,34,5
1,7,32,117.1475,150.04,0,32,4
2,4,28,108.6474,77.76,0,28,4
3,5,21,88.6657,32.34,0,21,2
4,1,11,34.8933,77.13,0,11,1
...,...,...,...,...,...,...,...
1654,4,25,95.7473,68.53,0,25,3
1655,2,13,37.3542,91.06,0,13,2
1656,3,13,48.6581,72.72,0,13,1
1657,7,33,125.8637,90.93,0,33,2


In [115]:
# Display the processed frame (pandas truncates the middle rows in the output).
data

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units
0,CHEMBL231779,APIXABAN,4,459.51,0,2.70,Apixaban,COc1ccc(-n2nc(C(N)=O)c3c2C(=O)N(c2ccc(N4CCCCC4...,Hit score,'=',0.2748,
1,CHEMBL186,CEFEPIME,4,480.57,0,-1.28,cefepime,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N...,Hit score,'=',0.1073,
2,CHEMBL240597,CHENODIOL,4,392.58,0,4.48,chenodeoxycholic-acid,C[C@H](CCC(=O)O)[C@H]1CC[C@H]2[C@H]3[C@H](CC[C...,Hit score,'=',0.2031,
3,CHEMBL1098,BUPIVACAINE,4,288.44,0,3.90,bupivacaine,CCCCN1CCCCC1C(=O)Nc1c(C)cccc1C,Hit score,'=',0.1503,
4,CHEMBL345714,ACIPIMOX,4,154.12,0,-0.28,Acipimox,Cc1cnc(C(=O)O)c[n+]1[O-],Hit score,'=',0.1380,
...,...,...,...,...,...,...,...,...,...,...,...,...
1664,CHEMBL6,INDOMETHACIN,4,357.79,0,3.93,Indomethacin,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c1ccc(Cl)cc1,Hit score,'=',0.1032,
1665,CHEMBL1311,ISOSORBIDE MONONITRATE,4,191.14,0,-1.28,Isosorbide mononitrate,O=[N+]([O-])O[C@@H]1CO[C@@H]2[C@@H](O)CO[C@H]12,Hit score,'=',0.1018,
1666,CHEMBL42280,L-ADRENALINE,0,183.21,0,0.35,L-Adrenaline,CNC[C@@H](O)c1ccc(O)c(O)c1,Hit score,'=',-0.1051,
1667,CHEMBL460291,LACIDIPINE,4,455.55,0,4.40,"LACIDIPINE (LACIPIL, MOTENS)",CCOC(=O)C1=C(C)NC(C)=C(C(=O)OCC)C1c1ccccc1/C=C...,Hit score,'=',0.0561,


# Visualize Data 
- distributions of predictors
- distribution of target
- correlations between predictors - explain based on physical properties
- molecules passing the Ghose and Lipinski filters