# Morgan Fingerprints

In [1]:
#imports
import pandas as pd
from rdkit import Chem
from tdc.single_pred import ADME

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# load data
# ADME is already imported in the imports cell at the top of the notebook,
# so it is not imported a second time here.
data = ADME(name = 'Solubility_AqSolDB')

# train/valid/test split. The method, seed and fractions are spelled out
# (rather than left to the library defaults) so the split is reproducible
# and visible to the reader.
split = data.get_split(method='random', seed=42, frac=[0.7, 0.1, 0.2])

# one dataframe per partition, keyed by the names used in the split dict
train_df, val_df, test_df = split['train'], split['valid'], split['test']

Found local copy...
Loading...
Done!


In [3]:
# Target units: the solubility values in this dataset are reported as LogS,
# with S the aqueous solubility in mol/L (M). Values originally given in g/L
# or mg/L were converted to LogS using the molecular mass of each compound.
# Source: Sorkun, M.C., Khetan, A. & Er, S. AqSolDB, a curated reference set of
# aqueous solubility and 2D descriptors for a diverse set of compounds.
# Sci Data 6, 143 (2019). https://doi.org/10.1038/s41597-019-0151-1
# Summary statistics of the training targets (column 'Y').
train_df.loc[:, 'Y'].describe()

count    6988.000000
mean       -2.862946
std         2.387322
min       -13.171900
25%        -4.280085
50%        -2.591850
75%        -1.155815
max         2.137682
Name: Y, dtype: float64

In [4]:
# load molecules
# Parse the SMILES strings in each split's 'Drug' column into RDKit Mol objects.
# NOTE: the column is overwritten in place, so re-running this cell would feed
# Mol objects (not SMILES strings) back into MolFromSmiles; reload the split first.
for split_name, frame in (('train', train_df), ('valid', val_df), ('test', test_df)):
    frame['Drug'] = frame['Drug'].apply(Chem.MolFromSmiles)
    # MolFromSmiles returns None instead of raising when a SMILES cannot be
    # parsed; fail here with a clear message rather than later during embedding.
    n_failed = int(frame['Drug'].isna().sum())
    if n_failed:
        raise ValueError(
            f"{n_failed} SMILES in the {split_name} split could not be parsed by RDKit"
        )
#sanity check
train_df



Unnamed: 0,Drug_ID,Drug,Y
0,Benzo[cd]indol-2(1H)-one,<rdkit.Chem.rdchem.Mol object at 0x000001D3522...,-3.254767
1,4-chlorobenzaldehyde,<rdkit.Chem.rdchem.Mol object at 0x000001D3522...,-2.177078
2,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,<rdkit.Chem.rdchem.Mol object at 0x000001D3522...,-4.662065
3,vinyltoluene,<rdkit.Chem.rdchem.Mol object at 0x000001D3522...,-3.123150
4,3-(3-ethylcyclopentyl)propanoic acid,<rdkit.Chem.rdchem.Mol object at 0x000001D3522...,-3.286116
...,...,...,...
6983,sarafloxacin,<rdkit.Chem.rdchem.Mol object at 0x000001D3552...,-3.130000
6984,sparfloxacin,<rdkit.Chem.rdchem.Mol object at 0x000001D3552...,-3.370000
6985,sulindac_form_II,<rdkit.Chem.rdchem.Mol object at 0x000001D3552...,-4.500000
6986,tetracaine,<rdkit.Chem.rdchem.Mol object at 0x000001D3552...,-3.010000


In [6]:
# Fingerprint Conversion with Morgan Fingerprints
from rdkit.Chem import rdFingerprintGenerator

# Cache of Morgan fingerprint generators keyed by (radius, fp_size). A generator
# can be reused for any number of molecules, so each configuration is built once
# instead of once per dataframe row.
_MORGAN_GENERATORS = {}


def _morgan_generator(radius, fp_size):
    """Return the cached Morgan generator for (radius, fp_size), creating it on first use."""
    key = (radius, fp_size)
    if key not in _MORGAN_GENERATORS:
        _MORGAN_GENERATORS[key] = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=fp_size
        )
    return _MORGAN_GENERATORS[key]


def embed(molecule, radius=2, fp_size=2048, add_hs=False):
    """Convert an RDKit molecule into a Morgan fingerprint.

    Parameters
    ----------
    molecule : rdkit.Chem.rdchem.Mol
        Molecule to fingerprint. Must not be None.
    radius : int, default 2
        Morgan radius passed to the fingerprint generator.
    fp_size : int, default 2048
        Length of the returned fingerprint.
    add_hs : bool, default False
        If True, fingerprint a copy of the molecule with explicit hydrogens.
        Chem.AddHs returns a new molecule and leaves its argument unchanged,
        so its return value has to be used for the hydrogens to take effect.
        The default fingerprints the molecule exactly as given.

    Returns
    -------
    numpy.ndarray
        Fingerprint of length ``fp_size``, as produced by the generator's
        ``GetFingerprintAsNumPy``.

    Raises
    ------
    ValueError
        If ``molecule`` is None (e.g. a SMILES string that failed to parse).
    """
    if molecule is None:
        raise ValueError("embed() received None instead of an RDKit molecule")
    if add_hs:
        molecule = Chem.AddHs(molecule)
    return _morgan_generator(radius, fp_size).GetFingerprintAsNumPy(molecule)

# Swap each split's 'Drug' column for the output of embed(), one frame at a
# time (train, then valid, then test). The column is overwritten in place.
for frame in (train_df, val_df, test_df):
    frame['Drug'] = frame['Drug'].apply(embed)

# sanity check: display the training frame with the embedded 'Drug' column
train_df

Unnamed: 0,Drug_ID,Drug,Y
0,Benzo[cd]indol-2(1H)-one,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.254767
1,4-chlorobenzaldehyde,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-2.177078
2,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-4.662065
3,vinyltoluene,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.123150
4,3-(3-ethylcyclopentyl)propanoic acid,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.286116
...,...,...,...
6983,sarafloxacin,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.130000
6984,sparfloxacin,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-3.370000
6985,sulindac_form_II,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-4.500000
6986,tetracaine,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",-3.010000


In [8]:
# explore fps
# Pull one fingerprint (row label 42 of the training frame) and show its
# length, its Python type and its contents, one per line.
s = train_df.loc[42, 'Drug']
print(len(s), type(s), s, sep='\n')

2048
<class 'numpy.ndarray'>
[0 1 0 ... 0 0 0]


### Model Constructor

In [9]:
import torch