In [2]:
import os,re,sys,pickle,datetime,time,random,itertools
import warnings
warnings.filterwarnings("ignore")
import numpy as np
#np.set_printoptions(threshold=sys.maxsize)
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import openpyxl
from openpyxl import load_workbook
import pandas as pd
import math
import seaborn as sns 
import umap
import umap.plot
from PIL import Image
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.decomposition import PCA
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
randomstate = 42
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, MolFromSmiles, PandasTools, Descriptors, Draw, PropertyMol, rdmolfiles, rdFMCS
from rdkit import RDConfig
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.Draw import MolsToGridImage, IPythonConsole, rdMolDraw2D
from rdkit.Chem.Draw.MolDrawing import MolDrawing,DrawingOptions, Font
DrawingOptions.bondLineWidth=1.8
DrawingOptions.includeAtomNumbers=False
size = (150, 150)
import plotly.express as px
import molplotly
import mols2grid
from chembl_webresource_client.new_client import new_client
from IPython.display import SVG
import pubchempy as pcp
from pubchempy import get_compounds, Compound
import mordred
from mordred import Calculator, descriptors
from tqdm import tqdm

def embed(mol):
    """Return a copy of ``mol`` with explicit hydrogens and an MMFF-optimized 3D conformer.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to embed. Hydrogens are added to the molecule returned by
        ``Chem.AddHs``; that object is what gets embedded and returned.

    Returns
    -------
    rdkit.Chem.Mol
        The hydrogen-complete molecule carrying the embedded conformer.

    Raises
    ------
    ValueError
        If RDKit cannot generate 3D coordinates for the molecule.
    """
    mol_with_H = Chem.AddHs(mol)
    # EmbedMolecule signals failure by returning -1 rather than raising. Without
    # this check the optimizer below fails with an unhelpful "Bad Conformer Id".
    if AllChem.EmbedMolecule(mol_with_H) < 0:
        raise ValueError("RDKit could not embed a 3D conformer for this molecule")
    # The optimizer's status code is deliberately not inspected, so a molecule
    # that did not fully converge is still returned (same as before).
    AllChem.MMFFOptimizeMolecule(mol_with_H)
    return mol_with_H

**Overview**

PubChem is a database of about 111 million chemical compounds and their activities in biological assays. PubChemPy retrieves data from PubChem over the web, so it requires internet access. PubChem identifies each compound by a compound ID (CID).

https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html

https://github.com/mcs07/PubChemPy


**Workflow**

Download a csv file containing a collection of compounds from a PubChem search. Alternatively, go to https://pubchem.ncbi.nlm.nih.gov/source/ to search for various data sources within pubchem. Select one of interest e.g. DrugBank then click '[no.] live substances'. Click 'View or Download Structures in PubChem', 'Download', and 'CSV' under Summary tab. 

This notebook will clean that data, calculate descriptors, reduce dimensionality, and provide an updated CSV file.

In [3]:
# Load the PubChem summary export and discard the bookkeeping columns that the
# rest of the notebook never reads.
df = pd.read_csv("PubChem_drugbank.csv")
df = df.drop(columns=['sid', 'sidsrcname', 'sidextid', 'sidmdate', 'depdate', 'depcatg', 'annotation'])

# Rows with a missing CID (NaN) get the placeholder 0 so the column can be cast
# to a plain integer dtype. The placeholder rows are removed later, when the
# SMILES lookup yields nothing for them.
df['cid'] = df['cid'].fillna(0).astype(np.int64)
df

Unnamed: 0,cid,subssynonym
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE..."
2,444719,METHIONINE PHOSPHONATE|DB02151
3,4064,meprobamate|Meprobamic acid|DB00371
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122
...,...,...
13570,60852,Ibandronic Acid|Ibandronate|DB00710
13571,92196,4-Methylaminorex|DB01447
13572,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325
13573,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...


Select a portion of dataset for analysis (optional)

In [4]:
# Work on an independent copy: later cells add columns to `df2`, and with a
# plain `df2 = df` alias those edits would silently change `df` as well,
# making this cell non-repeatable without reloading the CSV.
#df2 = df.head(3000).copy()   # optional: analyse only the first 3000 rows
df2 = df.copy()
df2

Unnamed: 0,cid,subssynonym
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE..."
2,444719,METHIONINE PHOSPHONATE|DB02151
3,4064,meprobamate|Meprobamic acid|DB00371
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122
...,...,...
13570,60852,Ibandronic Acid|Ibandronate|DB00710
13571,92196,4-Methylaminorex|DB01447
13572,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325
13573,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...


Obtain SMILES from PubChem using the CIDs (slow: roughly 1 h per 10k molecules)

In [5]:
# Look up the isomeric SMILES of every CID through PubChemPy, one compound at a time.
smiles = []
for cid in tqdm(df2['cid'], desc = 'Extracting smiles'):    # tqdm adds progress bar
    if cid == 0:
        # 0 is the placeholder written for missing CIDs, not a real compound:
        # record the failure directly instead of spending a web request on it.
        smiles.append(np.nan)
        continue
    try:
        smiles.append(pcp.Compound.from_cid(cid).isomeric_smiles)
    except Exception:
        # Any failed lookup becomes NaN. `Exception` (rather than a bare
        # `except`) lets Ctrl-C interrupt this long-running loop.
        smiles.append(np.nan)

# Build a new frame instead of editing in place, so the frame `df2` was created
# from is left untouched. Rows with a missing value in any column are dropped
# and the index is renumbered from 0.
df2 = df2.assign(smiles=smiles).dropna(axis=0).reset_index(drop=True)
df2

Extracting smiles: 100%|██████████| 13575/13575 [1:31:35<00:00,  2.47it/s]  


Unnamed: 0,cid,subssynonym,smiles
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...
...,...,...,...
10739,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O
10740,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2
10741,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O
10742,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2


Make rdkit molecule objects

In [6]:
# Parse every SMILES string into an RDKit molecule.
rdkit_obj = []
for smi in tqdm(df2['smiles'], desc = 'Making rdkit objects'):    # tqdm adds progress bar
    try:
        # MolFromSmiles does NOT raise on an unparsable SMILES: it logs an
        # RDKit error and returns None. None is what marks a failed row.
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Only reached for unusable input such as a non-string value.
        mol = None
    rdkit_obj.append(mol)
df2['rdmol'] = rdkit_obj

# pandas treats None as a missing value, so dropna removes the failed rows.
df3 = df2.dropna(axis=0).reset_index(drop=True)
df3


#df2['rdmol'] = df2['smiles'].map(lambda x: Chem.MolFromSmiles(x)) # faster method but doesnt handle errors

Making rdkit objects:   0%|          | 0/10744 [00:00<?, ?it/s]RDKit ERROR: [07:21:12] Explicit valence for atom # 7 Cl, 5, is greater than permitted
Making rdkit objects: 100%|██████████| 10744/10744 [00:01<00:00, 6119.36it/s]


Unnamed: 0,cid,subssynonym,smiles,rdmol
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i..."
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i..."
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...,...
10738,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10739,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2,"<img data-content=""rdkit/molecule"" src=""data:i..."
10740,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10741,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2,"<img data-content=""rdkit/molecule"" src=""data:i..."


Calculate optimized rdkit molecules (slow, 10k molecules = 1h)

In [7]:
# Generate an optimized 3D structure for every molecule with the shared
# `embed` helper (add hydrogens, embed, MMFF-optimize).
opt_mols = []
for mol in tqdm(df3['rdmol'], desc = 'Optimizing mols'):          # tqdm adds progress bar
    try:
        opt_mols.append(embed(mol))
    except Exception:
        # Molecules RDKit cannot process become NaN and are dropped below.
        # `Exception` (rather than a bare `except`) keeps Ctrl-C working
        # during this long-running loop.
        opt_mols.append(np.nan)
df3['rdmol_optimized'] = opt_mols

# Keep only the rows that were optimized successfully; renumber the index from 0.
df4 = df3.dropna(axis=0).reset_index(drop=True)
df4

#df3['rdmol_optimized'] = df3.rdmol.map(embed) # faster method but doesnt handle errors

Optimizing mols:   0%|          | 24/10743 [00:01<13:46, 12.97it/s]RDKit ERROR: [07:21:25] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:21:25] UFFTYPER: Unrecognized charge state for atom: 4
Optimizing mols:   0%|          | 47/10743 [00:03<17:20, 10.28it/s]RDKit ERROR: [07:21:26] UFFTYPER: Unrecognized atom type: Se2+2 (4)
Optimizing mols:   1%|          | 58/10743 [00:05<33:41,  5.29it/s]RDKit ERROR: [07:21:29] UFFTYPER: Unrecognized charge state for atom: 0
Optimizing mols:   1%|          | 87/10743 [00:09<17:11, 10.33it/s]RDKit ERROR: [07:21:33] UFFTYPER: Unrecognized atom type: Se2+2 (2)
Optimizing mols:   1%|▏         | 135/10743 [00:17<12:31, 14.11it/s]  RDKit ERROR: [07:21:40] UFFTYPER: Unrecognized atom type: Mo2 (1)
RDKit ERROR: [07:21:40] UFFTYPER: Unrecognized charge state for atom: 0
Optimizing mols:   2%|▏         | 169/10743 [02:50<53:32,  3.29it/s]   RDKit ERROR: [07:24:14] UFFTYPER: Unrecognized atom type: Fe3 (0)
RDKit ERROR: [07:24:18] UFFTYPER: U

Optimizing mols:  17%|█▋        | 1776/10743 [07:38<09:00, 16.60it/s]RDKit ERROR: [07:29:02] UFFTYPER: Unrecognized charge state for atom: 1
Optimizing mols:  24%|██▍       | 2556/10743 [08:20<07:22, 18.50it/s]RDKit ERROR: [07:29:44] UFFTYPER: Unrecognized charge state for atom: 22
Optimizing mols:  25%|██▌       | 2739/10743 [08:31<07:08, 18.68it/s]RDKit ERROR: [07:29:55] UFFTYPER: Unrecognized atom type: S_5+4 (5)
Optimizing mols:  27%|██▋       | 2951/10743 [08:42<05:29, 23.66it/s]RDKit ERROR: [07:30:06] UFFTYPER: Unrecognized charge state for atom: 17
Optimizing mols:  28%|██▊       | 3005/10743 [08:47<26:06,  4.94it/s]RDKit ERROR: [07:30:10] UFFTYPER: Unrecognized atom type: Pt3 (0)
Optimizing mols:  28%|██▊       | 3052/10743 [09:15<1:27:02,  1.47it/s]RDKit ERROR: [07:30:38] UFFTYPER: Unrecognized atom type: Pt3 (0)
Optimizing mols:  29%|██▊       | 3065/10743 [09:45<4:47:04,  2.24s/it] RDKit ERROR: [07:31:09] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:31:09

Optimizing mols:  33%|███▎      | 3538/10743 [14:43<06:03, 19.84it/s]RDKit ERROR: [07:36:07] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:36:07] UFFTYPER: Unrecognized atom type: Zn1+2 (0)
Optimizing mols:  33%|███▎      | 3572/10743 [15:00<07:22, 16.20it/s]RDKit ERROR: [07:37:34] UFFTYPER: Unrecognized atom type: V_3 (0)
Optimizing mols:  33%|███▎      | 3573/10743 [16:10<50:27:00, 25.33s/it]RDKit ERROR: [07:37:34] UFFTYPER: Unrecognized atom type: Sr (0)
RDKit ERROR: [07:37:34] UFFTYPER: Unrecognized atom type: Sr (0)
Optimizing mols:  33%|███▎      | 3581/10743 [16:12<12:18:24,  6.19s/it]RDKit ERROR: [07:37:35] UFFTYPER: Unrecognized charge state for atom: 5
RDKit ERROR: [07:37:35] UFFTYPER: Unrecognized atom type: Au5 (0)
RDKit ERROR: [07:37:35] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:37:35] UFFTYPER: Unrecognized charge state for atom: 0
Optimizing mols:  34%|███▎      | 3608/10743 [16:16<45:45,  2.60it/s]  RDKit ERROR: [07:37:40] UFFTY

Optimizing mols:  36%|███▌      | 3867/10743 [23:31<04:50, 23.70it/s]RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized atom type: Ca (0)
RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized atom type: Fe3 (0)
RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized atom type: Fe3 (0)
RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:44:55] UFFTYPER: Unrecognized atom type: Zn+2 (0)
RDKit ERROR: [07:44:56] UFFTYPER: Unrecognized charge state for atom: 0
Optimizing mols:  36%|███▌      | 3883/10743 [23:33<07:40, 14.90it/s]RDKit ERROR: [07:44:56] UFFTYPER: Unrecognized atom type: Fe3 (0)
RDKit ERROR: [07:44:57] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:44:57] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:44:57] UFFTYPER: Unrecognized atom type: Cu3 (0)
Optimizing mols:  36%|███▌      | 3892/10743 [23:33<07:40, 14.87it/s]RDKit ERROR: [07:44:57] UFFTYPER: Unr

RDKit ERROR: [07:58:20] UFFTYPER: Unrecognized charge state for atom: 0
Optimizing mols:  55%|█████▌    | 5953/10743 [36:58<03:11, 25.06it/s]RDKit ERROR: [07:58:22] UFFTYPER: Unrecognized atom type: Fe3 (0)
Optimizing mols:  55%|█████▌    | 5957/10743 [36:58<02:55, 27.24it/s]RDKit ERROR: [07:58:22] UFFTYPER: Unrecognized atom type: Fe3 (0)
Optimizing mols:  56%|█████▌    | 5989/10743 [37:01<06:33, 12.07it/s]RDKit ERROR: [07:58:25] UFFTYPER: Unrecognized atom type: Fe3 (0)
Optimizing mols:  56%|█████▌    | 5991/10743 [37:02<06:42, 11.80it/s]RDKit ERROR: [07:58:25] UFFTYPER: Unrecognized atom type: Au5 (0)
Optimizing mols:  56%|█████▌    | 6014/10743 [37:03<04:25, 17.80it/s]RDKit ERROR: [07:58:26] UFFTYPER: Unrecognized atom type: Xe3 (0)
Optimizing mols:  56%|█████▌    | 6035/10743 [37:08<20:05,  3.91it/s]RDKit ERROR: [07:58:31] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:58:31] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [07:58:31] UFFTYPER: Unreco

Optimizing mols:  62%|██████▏   | 6652/10743 [39:39<01:51, 36.81it/s]RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Cr3 (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Zn+2 (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Zn+2 (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Au5 (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: In+3 (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Sr (0)
RDKit ERROR: [08:01:03] UFFTYPER: Unrecognized atom type: Cr3 (1)
Optimizing mols:  62%|██████▏   | 6669/10743 [39:41<04:27, 15.24it/s]RDKit ERROR: [08:01:04] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [08:01:04] UFFTYPER: Unrecognized atom type: Ga+3 (0)
RDKit ERROR: [08:01:04] UFFTYPER: Unrecogni

Optimizing mols:  76%|███████▌  | 8154/10743 [48:31<03:11, 13.50it/s]RDKit ERROR: [08:09:55] UFFTYPER: Unrecognized atom type: Fe3 (0)
Optimizing mols:  77%|███████▋  | 8259/10743 [48:45<04:58,  8.33it/s]RDKit ERROR: [08:10:08] UFFTYPER: Unrecognized atom type: Re3 (0)
Optimizing mols:  77%|███████▋  | 8282/10743 [48:50<05:24,  7.58it/s]RDKit ERROR: [08:10:13] UFFTYPER: Unrecognized charge state for atom: 20
Optimizing mols:  77%|███████▋  | 8303/10743 [48:52<04:30,  9.01it/s]RDKit ERROR: [08:10:15] UFFTYPER: Unrecognized atom type: Fe1 (0)
Optimizing mols:  78%|███████▊  | 8433/10743 [49:05<07:12,  5.34it/s]RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom type: Mo3 (0)
RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom type: Mo3 (0)
RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom type: Mo3 (0)
RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom type: Mo3 (0)
RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom type: Mo3 (0)
RDKit ERROR: [08:10:28] UFFTYPER: Unrecognized atom ty

Unnamed: 0,cid,subssynonym,smiles,rdmol,rdmol_optimized
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...,...,...
10616,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10617,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10618,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10619,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."


In [8]:
# Save the cleaned table so the slow steps above need not be repeated.
# NOTE(review): the RDKit molecule columns are presumably stored as text only,
# so the molecules themselves cannot be restored from this file - confirm.
checkpoint_path = 'checkpoint_df4.csv'
df4.to_csv(checkpoint_path, index=True)

Calculate (all) mordred descriptors (slow, 10k molecules = 1h)

In [9]:
# Compute the registered mordred descriptors for every optimized molecule.
# NOTE(review): Calculator is created with its default options; check whether
# 3D descriptors are actually enabled, since only those need the optimized geometry.
calc = Calculator(descriptors)
optimized_mols = df4['rdmol_optimized']
df5 = calc.pandas(optimized_mols)       # one row of descriptors per molecule

# Sanity check: the descriptor table must line up row-for-row with df4.
n_in, n_out = len(df4), len(df5)
print(n_in,'molecules in',n_out,'molecules out (should match!)')

df5

100%|██████████| 10621/10621 [1:04:29<00:00,  2.74it/s] 


10621 molecules in 10621 molecules out (should match!)


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,11.589507,11.130344,0,0,19.6887,2.40062,4.80123,19.6887,1.23055,3.65928,...,9.588297,48.109140,237.084852,7.647898,432,25,76.0,88.0,7.66667,3.777778
1,12.934771,10.803382,1,0,22.0094,2.33885,4.67771,22.0094,1.29467,3.74693,...,9.555135,49.382507,226.099380,7.293528,558,24,84.0,96.0,5.80556,3.833333
2,6.888497,7.193066,2,1,10.7741,2.22015,4.4403,10.7741,1.07741,3.14309,...,8.717518,38.675269,185.027551,8.410343,130,10,42.0,43.0,5.92361,2.333333
3,10.495973,10.398546,0,0,16.7533,2.26382,4.52764,16.7533,1.11688,3.54923,...,9.048880,45.566268,218.126657,6.609899,406,16,64.0,66.0,7.78472,3.541667
4,26.479190,22.032686,0,2,45.3354,2.35799,4.71599,45.3354,1.25932,4.46929,...,10.329572,72.093870,536.267385,6.875223,4866,56,174.0,199.0,13.7917,8.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10616,13.760274,13.462337,4,1,20.5369,2.48406,4.96813,20.5369,1.08089,3.80788,...,9.932415,52.262605,319.094975,7.597499,756,29,90.0,101.0,10.7986,4.166667
10617,10.037394,9.168374,0,2,16.7305,2.3767,4.63136,16.7305,1.28697,3.50068,...,9.304650,58.064580,176.094963,7.043799,239,16,66.0,76.0,4.19444,2.888889
10618,2.449490,2.449490,0,0,3.4641,1.73205,3.4641,3.4641,0.866025,2.17806,...,6.188264,24.179697,60.057515,5.004793,9,0,12.0,9.0,3.11111,1.000000
10619,14.389425,12.216867,0,1,24.2211,2.34963,4.69926,24.2211,1.27479,3.84888,...,9.572271,51.676805,318.073161,8.370346,718,25,92.0,103.0,6.30556,4.305556


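Drop descriptors containing any errors, descriptors with zero variance (the same value for every molecule), and highly correlated descriptors (absolute correlation above 0.95; of each correlated pair, the later column is dropped)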

In [10]:
# Step 1 - drop every descriptor that failed for at least one molecule.
# mordred reports failures as Missing/Error objects inside the table; they are
# turned into NaN so that dropna can remove the affected columns.
df6 = df5.copy()
df6 = df6.applymap(
    lambda x: np.nan if isinstance(x, (mordred.error.Missing, mordred.error.Error)) else x
)
df6 = df6.dropna(axis=1)

# Step 2 - drop descriptors whose standard deviation is exactly 0 (constant columns).
non_zero_std = df6.std() != 0
df6 = df6[non_zero_std[non_zero_std].index]

# Step 3 - drop highly correlated descriptors. Only the strict upper triangle of
# the absolute correlation matrix is inspected, so each pair is tested once and
# the later column of a correlated pair is the one removed.
threshold = 0.95
df_corr = df6.corr().abs()
# Built-in `bool` replaces the `np.bool` alias, which no longer exists in current NumPy.
upper_mask = np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
upper = df_corr.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
df6 = df6.drop(to_drop, axis=1)
df6

Unnamed: 0,ABC,nAcid,nBase,nAromAtom,nSpiro,nBridgehead,nHetero,nB,nN,nO,...,JGI5,JGI6,JGI7,JGI8,JGI9,JGI10,JGT10,Diameter,SRW05,WPath
0,11.589507,0,0,0,0,0,8,0,1,7,...,0.035590,0.025374,0.012897,0.000000,0.000000,0.000000,0.557250,8,0.000000,432
1,12.934771,1,0,12,0,0,2,0,0,2,...,0.028704,0.032163,0.019097,0.011450,0.010000,0.012346,0.438046,10,0.000000,558
2,6.888497,2,1,0,0,0,6,0,1,3,...,0.033333,0.000000,0.000000,0.000000,0.000000,0.000000,0.655185,6,0.000000,130
3,10.495973,0,0,0,0,0,6,0,2,4,...,0.021605,0.022263,0.015625,0.000000,0.000000,0.000000,0.617277,8,0.000000,406
4,26.479190,0,2,12,0,0,8,0,4,2,...,0.022799,0.018744,0.012597,0.010736,0.007679,0.007748,0.513551,19,0.000000,4866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10616,13.760274,4,1,0,0,0,10,0,1,7,...,0.045752,0.015038,0.019886,0.017284,0.013333,0.000000,0.750406,10,0.000000,756
10617,10.037394,0,2,6,0,0,3,0,2,1,...,0.025722,0.020245,0.027778,0.000000,0.000000,0.000000,0.427170,7,2.397895,239
10618,2.449490,0,0,0,0,0,1,0,0,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,2,0.000000,9
10619,14.389425,0,1,12,0,0,3,0,2,0,...,0.021111,0.020204,0.015727,0.021007,0.005208,0.000000,0.397608,9,0.000000,718


In [11]:
# Write the filtered descriptor table to disk, keeping the row index.
descriptors_path = 'drugbank_descriptors.csv'
df6.to_csv(descriptors_path, index=True)

Scale the descriptors

In [12]:
# Standardize every descriptor column before dimensionality reduction.
df6.index = df6.index.astype(int)
#df6.dropna(axis=0,inplace=True)
X_all = np.array(df6)                    # descriptor matrix: one row per molecule
scaler = StandardScaler()
X_all_sc = scaler.fit_transform(X_all)   # fit on X_all, then scale that same matrix

Run umap

In [13]:
# Project the scaled descriptors with UMAP; the fixed random_state pins the seed.
reducer = umap.UMAP(random_state=42)
clusterable_embedding = reducer.fit_transform(X_all_sc)

# Show the coordinates of the first molecule as a quick check.
print(clusterable_embedding[0])

[13.586549  1.377939]


In [14]:
# Wrap the UMAP coordinates in a DataFrame with one named column per dimension.
embedding_columns = ['dimension 1', 'dimension 2']
df_1 = pd.DataFrame(data=clusterable_embedding, columns=embedding_columns)
df_1

Unnamed: 0,dimension 1,dimension 2
0,13.586549,1.377939
1,7.037136,3.928394
2,12.965233,3.895618
3,12.105888,3.041144
4,7.161107,10.575061
...,...,...
10616,13.987025,5.628865
10617,9.663800,4.089830
10618,14.827571,2.977263
10619,7.558755,2.631796


Recheck size of df4, make sure the number of rows matches df_1

In [15]:
# The next cell copies columns from df4 into df_1 by row label, so both tables
# must have the same number of rows. Fail loudly instead of relying on a
# visual comparison of the two displays.
assert len(df4) == len(df_1), (
    "row count mismatch: df4 has {} rows but df_1 has {}".format(len(df4), len(df_1))
)
df4

Unnamed: 0,cid,subssynonym,smiles,rdmol,rdmol_optimized
0,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
1,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
2,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
3,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
4,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...,...,...
10616,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10617,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10618,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
10619,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."


In [16]:
# Copy the identifying columns from df4 next to the UMAP coordinates.
# Each pair is (column name in df_1, source column in df4).
column_pairs = [
    ('CID', 'cid'),
    ('names', 'subssynonym'),
    ('smiles', 'smiles'),
    ('rdmol_optimized', 'rdmol_optimized'),
]
for target_name, source_name in column_pairs:
    df_1[target_name] = df4[source_name]
df_1

Unnamed: 0,dimension 1,dimension 2,CID,names,smiles,rdmol_optimized
0,13.586549,1.377939,445722,DB03657|N-(methoxycarbonyl)-beta-D-glucopyrano...,COC(=O)N[C@H]1[C@@H]([C@H]([C@@H]([C@H](O1)CO)...,"<img data-content=""rdkit/molecule"" src=""data:i..."
1,7.037136,3.928394,448006,"(+)-2-(4-biphenyl)propionic acid|2-(1,1'-BIPHE...",C[C@@H](C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
2,12.965233,3.895618,444719,METHIONINE PHOSPHONATE|DB02151,CSCC[C@H](N)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
3,12.105888,3.041144,4064,meprobamate|Meprobamic acid|DB00371,CCCC(C)(COC(=O)N)COC(=O)N,"<img data-content=""rdkit/molecule"" src=""data:i..."
4,7.161107,10.575061,2131,ambenonium|Ambenonum|Ambenonium Base|DB01122,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)CC1=CC=...,"<img data-content=""rdkit/molecule"" src=""data:i..."
...,...,...,...,...,...,...
10616,13.987025,5.628865,60852,Ibandronic Acid|Ibandronate|DB00710,CCCCCN(C)CCC(O)(P(=O)(O)O)P(=O)(O)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10617,9.663800,4.089830,92196,4-Methylaminorex|DB01447,CC1C(OC(=N1)N)C2=CC=CC=C2,"<img data-content=""rdkit/molecule"" src=""data:i..."
10618,14.827571,2.977263,3776,isopropanol|Isopropyl alcohol|2-Propanol|DB02325,CC(C)O,"<img data-content=""rdkit/molecule"" src=""data:i..."
10619,7.558755,2.631796,6834,BROMPHENIRAMINE|1-(p-Bromophenyl)-1-(2-pyridyl...,CN(C)CCC(C1=CC=C(C=C1)Br)C2=CC=CC=N2,"<img data-content=""rdkit/molecule"" src=""data:i..."


Export the final csv file

In [17]:
# Write the UMAP coordinates together with the compound metadata to disk.
# NOTE(review): 'rdmol_optimized' holds RDKit objects and is presumably written
# as text only - confirm it is useful in the exported file.
umap_export_path = 'drugbank_umap_data.csv'
df_1.to_csv(umap_export_path, index=True)

**Misc. PubChemPy tools**

Information on a particular molecule

In [None]:
# Fetch the PubChem record for CID 5090 and print a selection of its properties.
# NOTE(review): some of these attributes may trigger their own web request - confirm.
c = pcp.Compound.from_cid(5090)

print(c.iupac_name)
print(c.synonyms[0])          # first entry of the synonym list only
print(c.molecular_formula)
print(c.molecular_weight)
print(c.isomeric_smiles)
print(c.xlogp)

Search for a molecule using name, smiles, sdf, inchi, inchikey, or formula

In [None]:
# Search PubChem by name, then list the isomeric SMILES of every match.
compounds = pcp.get_compounds('Glucose', 'name')
print(compounds)
for match in compounds:
    print(match.isomeric_smiles)

Similarity calculation

In [None]:
def tanimoto(compound1, compound2):
    """Tanimoto similarity of two compounds' PubChem substructure fingerprints.

    Parameters
    ----------
    compound1, compound2 : objects with a ``fingerprint`` attribute
        ``fingerprint`` must be the hex-encoded PubChem fingerprint string as
        exposed by ``pubchempy.Compound.fingerprint``: 8 hex characters holding
        the fingerprint length, followed by the fingerprint bits.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` over the fingerprint bits, between 0.0 and 1.0.
        Returns 0.0 when neither fingerprint has any bit set.
    """
    def _structure_bits(compound):
        # The first 4 bytes (8 hex characters) encode the fingerprint length,
        # not structure. Counting them would add the same set bits to both
        # compounds and inflate every similarity score.
        payload = compound.fingerprint[8:]
        return int(payload or '0', 16)

    fp1 = _structure_bits(compound1)
    fp2 = _structure_bits(compound2)
    both_count = bin(fp1 & fp2).count('1')
    either_count = bin(fp1 | fp2).count('1')
    if either_count == 0:
        # No bits set in either fingerprint: avoid dividing by zero.
        return 0.0
    return float(both_count) / either_count

# Compare two compounds, fetched from PubChem by CID, with the tanimoto helper.
first_cid, second_cid = 108770, 2244
mol1 = pcp.Compound.from_cid(first_cid)
mol2 = pcp.Compound.from_cid(second_cid)

tanimoto(mol1, mol2)