In [1]:
# Load the autoreload extension; mode 2 picks up edits to imported modules
# live, so the kernel does not need a restart after changing library code.
%load_ext autoreload
%autoreload 2

### Setup

In [2]:
import datamol as dm
import random
import numpy as np
from loguru import logger

# seed both random number generators (stdlib and NumPy) for reproducibility
random.seed(10)
np.random.seed(10)

# summarize long arrays in the cell outputs instead of printing every element
np.set_printoptions(threshold=10)


def check_value_error(fn, mol):
    """Assert that calling ``fn(mol)`` raises a ``ValueError``.

    The caught error is reported through ``logger.error`` so that its message
    is visible in the cell output.

    Args:
        fn: callable invoked with ``mol`` as its only argument.
        mol: input forwarded unchanged to ``fn``.

    Returns:
        ``True`` when ``fn(mol)`` raised a ``ValueError``.

    Raises:
        AssertionError: if ``fn(mol)`` returns without raising anything.
        Exception: any exception other than ``ValueError`` raised by ``fn``
            propagates unchanged, so an unrelated failure is not reported
            as "Error not raised".
    """
    try:
        fn(mol)
    except ValueError as e:
        logger.error(e)
        return True
    # Only reached when ``fn`` returned normally. A ``finally`` clause is
    # deliberately avoided: it would also run while another exception type is
    # propagating and would replace it with a misleading AssertionError.
    raise AssertionError("Error not raised")


#### Example molecules (2D and 3D)

# draw 500 rows from the FreeSolv table and keep only the SMILES column
data = (
    dm.data.freesolv()
    .sample(500)["smiles"]
    .values
)
# one SMILES used as the 2D input throughout the notebook
mol2d = data[83]
# the same molecule with generated conformers, for the 3D calculators
mol3d = dm.conformers.generate(
    dm.to_mol(mol2d),
)

### Using the `calc` module of `molfeat`

In [3]:
from molfeat.calc import get_calculator
from molfeat.calc import CATS2D
from molfeat.calc import CATS3D
from molfeat.calc import RDKitDescriptors2D
from molfeat.calc import RDKitDescriptors3D
from molfeat.calc import MordredDescriptors
from molfeat.calc import FPCalculator
from molfeat.calc import Pharmacophore
from molfeat.calc import PMapper3D
from molfeat.calc import ScaffoldKeyCalculator

Using backend: pytorch


#### CATS Descriptors

In [4]:
# CATS calculator for inputs without conformers
cats_calc = CATS2D(
    scale="num",
    max_dist=5,
    bins=[0, 3, 5, 7],
)

In [5]:
# length reported by the calculator
print(len(cats_calc))
# apply the calculator to the SMILES held in mol2d
cats_calc(mol2d)

63


array([0., 0., 0., ..., 0., 0., 0.])

In [6]:
# 3D counterpart of the CATS calculator, configured with the same arguments
cats3d_calc = CATS3D(
    scale="num",
    max_dist=5,
    bins=[0, 3, 5, 7],
)

In [7]:
# we expect an error on molecules without conformers
check_value_error(cats3d_calc, mol2d)

2021-07-16 17:54:05.911 | ERROR    | __main__:check_value_error:18 - Expected a molecule with conformers information.


True

In [8]:
# should work once conformers are available (mol3d comes from dm.conformers.generate)
cats3d_calc(mol3d)

array([0., 0., 0., ..., 0., 0., 0.])

#### RDKit molecule descriptors

In [9]:
# 2D RDKit descriptors with NaN replacement switched off
rdkit_desc = RDKitDescriptors2D(
    replace_nan=False,
    augment=True,
)
rdkit_desc(mol2d)

array([ 6.82415895, -1.37469136,  6.82415895, ...,  0.        ,
        0.        ,  3.        ])

In [10]:
rdkit3d_desc = RDKitDescriptors3D(replace_nan=False)
# check that we get a different vector for different conformers
conf1_out = rdkit3d_desc(mol3d, conformer_id=2)
conf_out = rdkit3d_desc(mol3d, conformer_id=-1)
# NaN never compares equal to itself, so a plain `!=` reports a difference
# wherever both outputs are NaN (possible here since replace_nan=False).
# Mask those positions so the assertion only passes on a genuine difference.
# NOTE(review): assumes both outputs are float arrays — confirm.
assert np.any((conf1_out != conf_out) & ~(np.isnan(conf1_out) & np.isnan(conf_out)))

In [11]:
# show the names of the first ten descriptors
# Most calculators expose a `columns` attribute
print(rdkit3d_desc.columns[:10])

['CalcAsphericity', 'CalcEccentricity', 'CalcInertialShapeFactor', 'CalcNPR1', 'CalcNPR2', 'CalcPMI1', 'CalcPMI2', 'CalcPMI3', 'CalcRadiusOfGyration', 'CalcSpherocityIndex']


#### Mordred descriptors

Mordred descriptors can result in NaN values. The default behaviour is to keep the NaN values as is. If you do not plan to perform imputation, specify `replace_nan=True`.

In [12]:
# calculator with the 3D descriptors ignored and NaN replacement enabled
mrd_desc = MordredDescriptors(
    ignore_3D=True,
    replace_nan=True,
)
# calculator that keeps the 3D descriptors and leaves NaN replacement off
mrd3d_desc = MordredDescriptors(
    ignore_3D=False,
    replace_nan=False,
)
print("2D only", len(mrd_desc), "2D+3D", len(mrd3d_desc))

2D only 1613 2D+3D 1826


In [13]:
# the 2D-only calculator should give the same output with or without conformers
(mrd_desc(mol2d) == mrd_desc(mol3d)).all()

True

In [14]:
# without conformers, the 2D+3D calculator should yield at least as many NaN
# values as it does for the molecule that has conformers
assert np.count_nonzero(np.isnan(mrd3d_desc(mol2d))) >= np.count_nonzero(
    np.isnan(mrd3d_desc(mol3d))
)

#### Fingerprints 

In [15]:
# fingerprint calculator selected by the name "maccs", applied to the 2D input
maccs_calc = FPCalculator("maccs")
maccs_calc(mol2d)

array([0, 0, 0, ..., 1, 1, 0], dtype=uint8)

In [16]:
secfp_cal = FPCalculator(
    "secfp",
    length=1024,
)
# a non-zero total means some bits are activated
secfp_cal(mol2d).sum()

33

In [17]:
# check the fingerprinter length (the calculator was created with length=1024)
print(len(secfp_cal))

1024


In [18]:
# count vectors are folded to the desired length
morgan_count_calc = FPCalculator(
    "ecfp-count",
    length=256,
    radius=3,
    useChirality=True,
    includeRedundantEnvironments=True,
)
# kept in `val` so the next cell can compare against it
val = morgan_count_calc(mol2d)
val

array([0, 0, 0, ..., 1, 0, 0])

In [19]:
# a calculator built with different parameters should produce a different fingerprint
morgan_count_calc2 = FPCalculator(
    "ecfp-count",
    length=256,
    radius=2,
)
(morgan_count_calc2(mol2d) != val).any()

True

In [20]:
# checking that we can return the original vector type
# NOTE(review): the comment above mentions the original vector type, but the
# call below passes raw=False — confirm which of the two is intended.
rdkit_calc = FPCalculator("rdkit-count", length=512)
# largest value found in the returned vector
np.asarray(list(rdkit_calc(mol2d, raw=False))).max()

138

#### 2D Pharmacophores

In [21]:
# pharmacophore calculator using the "pmapper" factory; print its length
pharm_calc = Pharmacophore(factory="pmapper")
print(len(pharm_calc))

19355


In [22]:
# same factory, this time with an explicit maximum length
pharm_calc_1024 = Pharmacophore(
    factory="pmapper",
    max_length=1024,
    useCounts=True,
)
print(len(pharm_calc_1024))

1024


In [23]:
# check the folding into 1024 bits and count info used
arr = pharm_calc_1024(mol2d)
# indices of the non-zero entries
print(arr.nonzero())

(array([  14,   15,   16, ...,  995, 1016, 1018]),)


In [24]:
# same molecule with the calculator that was created without max_length
arr = pharm_calc(mol2d)  # note: rebinds `arr` from the previous cell
# indices of the non-zero entries
print(arr.nonzero())

(array([   14,    15,    16, ..., 12600, 12601, 12608]),)


#### 3D Pharmacophores aka PMapper3D

In [25]:
# 3D pharmacophore calculator; the same settings are passed to pmapper's
# own get_fp in the comparison cell further down
pmap_calc = PMapper3D(
    min_features=2,
    max_features=3,
    length=512,
    tol=5,
    use_modulo=False,
)

In [26]:
# Expect a ValueError here: mol2d carries no conformer information
check_value_error(pmap_calc, mol2d)

2021-07-16 17:54:08.550 | ERROR    | __main__:check_value_error:18 - Expected a molecule with conformers information.


True

In [27]:
# fingerprint for the conformer with id 0
val = pmap_calc(mol3d, conformer_id=0)  # note: rebinds `val` used in earlier cells
# indices of the non-zero entries
print(val.nonzero())

(array([ 16,  24,  32, ..., 482, 483, 492]),)


In [28]:
## check that the new fingerprint is the same as the original pmapper get_fp
# aliased on import so it does not shadow molfeat's own `Pharmacophore`
from pmapper.pharmacophore import Pharmacophore as PmapperPharmacophore

reference_pharm = PmapperPharmacophore()
reference_pharm.load_from_mol(mol3d)
# generate 3D pharmacophore fingerprint which takes into account stereoconfiguration
reference_bits = reference_pharm.get_fp(
    min_features=2,
    max_features=3,
    tol=5,
    nbits=512,
)  # set of activated bits
assert set(val.nonzero()[0]) == reference_bits

#### Scaffold keys

In [29]:
# scaffold-key calculator created with its default arguments
skey = ScaffoldKeyCalculator()
skey(mol2d)

array([13,  0, 23, ...,  0,  0,  0])

#### Test `get_calculator`

In [30]:
# build a calculator from its name ("pmapper"), passing min_features=1
calc = get_calculator("pmapper", min_features=1)
# applied to the molecule that has conformers
calc(mol3d)

array([0., 0., 0., ..., 0., 0., 0.])