In [1]:
# Imports and session-wide configuration for the notebook.
import os,re,sys,pickle,datetime,time,random,itertools
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings and library
# deprecation notices raised by later cells -- confirm that is intended.
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import openpyxl
from openpyxl import load_workbook
import pandas as pd
import math
import seaborn as sns 
import umap
import umap.plot
from PIL import Image
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.decomposition import PCA
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
# Presumably a shared random seed; it is not referenced by any cell visible here.
randomstate = 42
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, MolFromSmiles, PandasTools, Descriptors, Draw, PropertyMol, rdmolfiles, rdFMCS
from rdkit import RDConfig
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.Draw import MolsToGridImage, IPythonConsole, rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing,DrawingOptions, Font
# Global RDKit drawing options: bond line width 1.8, atom numbers not included.
DrawingOptions.bondLineWidth=1.8
DrawingOptions.includeAtomNumbers=False
# Presumably the (width, height) used for molecule images; not referenced by any cell visible here.
size = (150, 150)
import plotly.express as px
import molplotly # requires dash version 2.0 (latest = 2.1); use pip list and pip install dash==2.0
import mols2grid
from tdc.single_pred import ADME
import useful_rdkit_utils as uru
from tqdm import tqdm
# REOS object from useful_rdkit_utils; not used by the cells visible here.
reos = uru.REOS()
# Descriptor calculator used by the "Calculate descriptors" cells, which read
# its `names` attribute and call its `calc_smiles` method on each SMILES string.
ro5_calc = uru.Ro5Calculator()

**Calculate "rule of 5" descriptors**

Lipinski's rule of five (Ro5) is used to evaluate "druglikeness", i.e. whether a chemical compound has the properties to be an orally active drug in humans. It is important to maintain drug-like physicochemical properties as a pharmacologically active lead structure is optimized to increase activity and selectivity. The rule describes molecular properties important for a drug's pharmacokinetics in the human body (absorption, distribution, metabolism, and excretion). Candidate drugs that conform to the Ro5 tend to have lower attrition rates during clinical trials; however, there are many exceptions: only about 50% of orally administered new chemical entities actually obey the rule.

Code from: https://github.com/PatWalters/useful_rdkit_utils/blob/master/notebooks/demo_REOS.ipynb

## Descriptor calculation

Read in data

In [2]:
# Load the DrugBank table and discard the columns this notebook does not use
# ('Unnamed: 0' is presumably an index column written by an earlier to_csv call).
df = (
    pd.read_csv("drugbank_data_v1.csv")
    .drop(columns=["Unnamed: 0", "dimension 1", "dimension 2"])
)
# Cast the row labels to int, exactly as the original cell did.
df.index = df.index.astype(int)
df

Unnamed: 0,CID,names,smiles,rdmol_optimized
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i..."
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i..."
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...,...
10616,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10617,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2,"<img data-content=""rdkit/molecule"" src=""data:i..."
10618,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10619,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2,"<img data-content=""rdkit/molecule"" src=""data:i..."


In [4]:
# Work on the first 20 rows while developing; swap in the commented line below
# to process the full table instead.
# df2 = df.copy()
#
# .copy() gives df2 its own data. Later cells add columns to df2, and without
# the copy those writes would target a slice of `df` (pandas would warn about
# it, but warnings are silenced globally in the import cell).
df2 = df.head(20).copy()
df2

Unnamed: 0,CID,names,smiles,rdmol_optimized
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i..."
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i..."
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i..."
5,135398619,guanosine-5'-diphosphate|DB04315,C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O...,"<img data-content=""rdkit/molecule"" src=""data:i..."
6,54688261,DB02507|4-HYDROXY-3-[(1S)-3-OXO-1-PHENYLBUTYL]...,CC(=O)C[C@@H](C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
7,145729,DEOXYURIDINE-5'-DIPHOSPHATE|DB03413,C1[C@@H]([C@H](O[C@H]1N2C=CC(=O)NC2=O)COP(=O)(...,"<img data-content=""rdkit/molecule"" src=""data:i..."
8,2471,bumetanide|3-(Aminosulfonyl)-5-(butylamino)-4-...,CCCCNC1=C(C(=CC(=C1)C(=O)O)S(=O)(=O)N)OC2=CC=C...,"<img data-content=""rdkit/molecule"" src=""data:i..."
9,444425,tl-3-093|DB01891,C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC1=CC...,"<img data-content=""rdkit/molecule"" src=""data:i..."


Calculate descriptors (10621 DrugBank molecules took 5 seconds)

In [10]:
# Compute the descriptors for every SMILES string and collect them in a frame
# whose columns are taken from ro5_calc.names and whose index matches df2.
# Building the frame directly (instead of pre-filling the columns with ""
# placeholders) avoids creating string/object-dtype columns that are then
# overwritten with numbers.
ro5_values = pd.DataFrame(
    df2["smiles"].apply(ro5_calc.calc_smiles).to_list(),
    columns=ro5_calc.names,
    index=df2.index,
)
# assign() returns a new frame, so re-running this cell simply overwrites the
# descriptor columns and never writes into a slice of another DataFrame.
df2 = df2.assign(**{name: ro5_values[name] for name in ro5_calc.names})
df2

Unnamed: 0,CID,names,smiles,rdmol_optimized,MolWt,LogP,HBD,HBA,TPSA
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i...",237.208,-2.8577,5.0,7.0,128.48
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",226.275,3.5417,1.0,1.0,37.3
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",185.185,0.202,3.0,3.0,83.55
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i...",218.253,0.9834,2.0,4.0,104.64
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i...",537.576,4.6392,2.0,2.0,58.2
5,135398619,guanosine-5'-diphosphate|DB04315,C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O...,"<img data-content=""rdkit/molecule"" src=""data:i...",443.202,-2.4527,7.0,12.0,252.57
6,54688261,DB02507|4-HYDROXY-3-[(1S)-3-OXO-1-PHENYLBUTYL]...,CC(=O)C[C@@H](C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",308.333,3.6096,1.0,4.0,67.51
7,145729,DEOXYURIDINE-5'-DIPHOSPHATE|DB03413,C1[C@@H]([C@H](O[C@H]1N2C=CC(=O)NC2=O)COP(=O)(...,"<img data-content=""rdkit/molecule"" src=""data:i...",388.162,-1.5887,5.0,9.0,197.61
8,2471,bumetanide|3-(Aminosulfonyl)-5-(butylamino)-4-...,CCCCNC1=C(C(=CC(=C1)C(=O)O)S(=O)(=O)N)OC2=CC=C...,"<img data-content=""rdkit/molecule"" src=""data:i...",364.423,3.0365,3.0,5.0,118.72
9,444425,tl-3-093|DB01891,C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC1=CC...,"<img data-content=""rdkit/molecule"" src=""data:i...",455.555,2.1619,4.0,5.0,116.76


## Search for "drug-like" compounds

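Add a True/False column ("Pass Ro5") that flags compounds meeting all of the cut-offs used here: MolWt ≤ 500, LogP ≤ 5, HBD ≤ 5, HBA ≤ 10 and TPSA ≤ 140. Note that the TPSA limit is an additional criterion; Lipinski's original rule of five covers only the first four properties.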

In [11]:
# Upper limit (inclusive) applied to each descriptor column.
ro5_upper_limits = {"MolWt": 500, "LogP": 5, "HBD": 5, "HBA": 10, "TPSA": 140}

# One boolean mask per descriptor; a row passes only if every mask is True.
ro5_masks = [df2[column] <= limit for column, limit in ro5_upper_limits.items()]
df2["Pass Ro5"] = pd.concat(ro5_masks, axis=1).all(axis=1)
df2

Unnamed: 0,CID,names,smiles,rdmol_optimized,MolWt,LogP,HBD,HBA,TPSA,Pass Ro5
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i...",237.208,-2.8577,5.0,7.0,128.48,True
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",226.275,3.5417,1.0,1.0,37.3,True
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",185.185,0.202,3.0,3.0,83.55,True
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i...",218.253,0.9834,2.0,4.0,104.64,True
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i...",537.576,4.6392,2.0,2.0,58.2,False
5,135398619,guanosine-5'-diphosphate|DB04315,C1=NC2=C(N1[C@H]3[C@@H]([C@@H]([C@H](O3)COP(=O...,"<img data-content=""rdkit/molecule"" src=""data:i...",443.202,-2.4527,7.0,12.0,252.57,False
6,54688261,DB02507|4-HYDROXY-3-[(1S)-3-OXO-1-PHENYLBUTYL]...,CC(=O)C[C@@H](C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",308.333,3.6096,1.0,4.0,67.51,True
7,145729,DEOXYURIDINE-5'-DIPHOSPHATE|DB03413,C1[C@@H]([C@H](O[C@H]1N2C=CC(=O)NC2=O)COP(=O)(...,"<img data-content=""rdkit/molecule"" src=""data:i...",388.162,-1.5887,5.0,9.0,197.61,False
8,2471,bumetanide|3-(Aminosulfonyl)-5-(butylamino)-4-...,CCCCNC1=C(C(=CC(=C1)C(=O)O)S(=O)(=O)N)OC2=CC=C...,"<img data-content=""rdkit/molecule"" src=""data:i...",364.423,3.0365,3.0,5.0,118.72,True
9,444425,tl-3-093|DB01891,C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC1=CC...,"<img data-content=""rdkit/molecule"" src=""data:i...",455.555,2.1619,4.0,5.0,116.76,True


In [12]:
# Keep only the rows whose "Pass Ro5" flag is True.
passes_ro5 = df2["Pass Ro5"].eq(True)
df_ro5_pass = df2[passes_ro5]
df_ro5_pass

Unnamed: 0,CID,names,smiles,rdmol_optimized,MolWt,LogP,HBD,HBA,TPSA,Pass Ro5
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i...",237.208,-2.8577,5.0,7.0,128.48,True
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",226.275,3.5417,1.0,1.0,37.3,True
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",185.185,0.202,3.0,3.0,83.55,True
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i...",218.253,0.9834,2.0,4.0,104.64,True
6,54688261,DB02507|4-HYDROXY-3-[(1S)-3-OXO-1-PHENYLBUTYL]...,CC(=O)C[C@@H](C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",308.333,3.6096,1.0,4.0,67.51,True
8,2471,bumetanide|3-(Aminosulfonyl)-5-(butylamino)-4-...,CCCCNC1=C(C(=CC(=C1)C(=O)O)S(=O)(=O)N)OC2=CC=C...,"<img data-content=""rdkit/molecule"" src=""data:i...",364.423,3.0365,3.0,5.0,118.72,True
9,444425,tl-3-093|DB01891,C[C@@H](C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC1=CC...,"<img data-content=""rdkit/molecule"" src=""data:i...",455.555,2.1619,4.0,5.0,116.76,True
15,233240,1H-BENOXIMIDAZOLE-2-CARBOXYLIC ACID|DB03028,C1=CC=C2C(=C1)NC(=N2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...",162.148,1.2611,2.0,2.0,65.98,True
16,449088,N-(5-CYCLOPROPYL-1H-PYRAZOL-3-YL)BENZAMIDE|DB0...,C1CC1C2=CC(=NN2)NC(=O)C3=CC=CC=C3,"<img data-content=""rdkit/molecule"" src=""data:i...",227.267,2.5394,2.0,2.0,57.78,True
18,3260017,Oxaloacetate Ion|DB02637,C(C(=O)C(=O)O)C(=O)[O-],"<img data-content=""rdkit/molecule"" src=""data:i...",131.063,-2.2199,1.0,4.0,94.5,True


In [None]:
# Write the passing subset to disk; the DataFrame index is included in the file.
df_ro5_pass.to_csv(path_or_buf="Ro5_passes.csv", index=True)

## Add the descriptors to an existing data set

In [None]:
# Reload the data set that the descriptors will be appended to.
# NOTE(review): this reads "drugbank_data_1.csv", whereas the first section reads
# "drugbank_data_v1.csv" -- confirm the two different file names are intentional.
df = pd.read_csv("drugbank_data_1.csv").drop(columns=["Unnamed: 0"])
# Cast the row labels to int, exactly as the original cell did.
df.index = df.index.astype(int)
# Work on an independent copy so `df` keeps the data as loaded.
df2 = df.copy()
df2

In [None]:
# Compute the descriptors for every SMILES string and collect them in a frame
# whose columns are taken from ro5_calc.names and whose index matches df2.
# Building the frame directly (instead of pre-filling the columns with ""
# placeholders) avoids creating string/object-dtype columns that are then
# overwritten with numbers.
ro5_values = pd.DataFrame(
    df2["smiles"].apply(ro5_calc.calc_smiles).to_list(),
    columns=ro5_calc.names,
    index=df2.index,
)
# assign() returns a new frame, so re-running this cell simply overwrites the
# descriptor columns.
df2 = df2.assign(**{name: ro5_values[name] for name in ro5_calc.names})

# Flag rows that satisfy every cut-off (all limits are inclusive).
df2["Pass Ro5"] = (
    (df2["MolWt"] <= 500)
    & (df2["LogP"] <= 5)
    & (df2["HBD"] <= 5)
    & (df2["HBA"] <= 10)
    & (df2["TPSA"] <= 140)
)
df2

In [None]:
# Write the table with descriptors and the Ro5 flag to disk; the DataFrame index is included.
df2.to_csv(path_or_buf="drugbank_data_v2_Ro5.csv", index=True)