In [1]:
import torch
import numpy as np
import os
from torch import nn
import sys
import polars as pl
import sklearn
import shap
from functools import partial
import pandas as pd
import random
from matplotlib import pyplot as plt
import time
from rdkit.Chem import DataStructs
from tqdm import tqdm
from joblib import Parallel, delayed
import math
from skfp import fingerprints as skfps


# Add the parent directory to the module search path
# (presumably so project-local modules can be imported from this notebook — confirm).
sys.path.append('../')

In [2]:
# Machine-specific absolute path to the test CSV; edit this single constant
# when running the notebook on another machine.
TEST_CSV_PATH = '/home/dangnh36/datasets/competitions/leash_belka/processed/test_v4.csv'

# Lazily scan the CSV, attach a row index, and materialise only the columns
# that the rest of the notebook reads.
test_df = (
    pl.scan_csv(TEST_CSV_PATH)
    .with_row_index('index')
    .select(
        pl.col('molecule'),
        pl.col('index', 'mol_group'),
    )
    .collect()
)
print(test_df.estimated_size('gb'), 'GB')
# `GroupBy.count()` is deprecated in polars; `pl.len()` is its replacement.
# The alias keeps the result column named 'count', as before.
test_df.group_by('mol_group').agg(pl.len().alias('count')).sort('mol_group')

0.07109459023922682 GB


  test_df.group_by('mol_group').count().sort('mol_group')


mol_group,count
i64,u32
0,369039
1,486390
2,11271
3,11322


In [3]:
def test_fp(fp_class):
    try:
        start = time.time()
        smiles = test_df[:200, 'molecule']
        fp = fp_class()
        fps = fp.transform(smiles)
        end = time.time()
        take = round((end - start) * 1000, 2)
        print('Take:', take, 'ms')
        # assert (0 <= fps).all() and (1 >= fps).all()
        print(type(fps), fps.dtype, fps.shape)
        print(fps)
        return take, fps.dtype, fps.shape, ''
    except Exception as e:
        return None, None, None, str(e)

In [4]:
from skfp.bases import BaseFingerprintTransformer
import inspect

# Gather every class exposed by skfp.fingerprints that derives from
# BaseFingerprintTransformer, in the (alphabetical) order dir() reports them.
all_skfp_classes = [
    member
    for member in (getattr(skfps, attr_name) for attr_name in dir(skfps))
    if inspect.isclass(member) and issubclass(member, BaseFingerprintTransformer)
]

all_skfp_classes

[skfp.fingerprints.atom_pair.AtomPairFingerprint,
 skfp.fingerprints.autocorr.AutocorrFingerprint,
 skfp.fingerprints.avalon.AvalonFingerprint,
 skfp.fingerprints.e3fp_fp.E3FPFingerprint,
 skfp.fingerprints.ecfp.ECFPFingerprint,
 skfp.fingerprints.erg.ERGFingerprint,
 skfp.fingerprints.estate.EStateFingerprint,
 skfp.fingerprints.functional_groups.FunctionalGroupsFingerprint,
 skfp.fingerprints.getaway.GETAWAYFingerprint,
 skfp.fingerprints.ghose_crippen.GhoseCrippenFingerprint,
 skfp.fingerprints.klekota_roth.KlekotaRothFingerprint,
 skfp.fingerprints.laggner.LaggnerFingerprint,
 skfp.fingerprints.layered.LayeredFingerprint,
 skfp.fingerprints.lingo.LingoFingerprint,
 skfp.fingerprints.maccs.MACCSFingerprint,
 skfp.fingerprints.map.MAPFingerprint,
 skfp.fingerprints.mhfp.MHFPFingerprint,
 skfp.fingerprints.morse.MORSEFingerprint,
 skfp.fingerprints.mqns.MQNsFingerprint,
 skfp.fingerprints.mordred_fp.MordredFingerprint,
 skfp.fingerprints.pattern.PatternFingerprint,
 skfp.fingerprints.

In [46]:
# One list per summary column, filled in lock-step by the loop below.
all_class_name, all_dtypes, all_shapes, all_errs, takes = [], [], [], [], []

for fp_class in tqdm(all_skfp_classes):
    # test_fp returns (elapsed_ms, dtype, shape, error_message).
    outcome = test_fp(fp_class)
    all_class_name.append(fp_class.__name__)
    for bucket, value in zip((takes, all_dtypes, all_shapes, all_errs), outcome):
        bucket.append(value)

# Assemble the per-class summary table; the trailing expression displays it.
df = pd.DataFrame(dict(zip(
    ('class', 'dtype', 'shape', 'error', 'take'),
    (all_class_name, all_dtypes, all_shapes, all_errs, takes),
)))
df

  6%|██▊                                        | 2/31 [00:00<00:03,  8.74it/s]

Take: 128.21 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]
Take: 102.16 ms
<class 'numpy.ndarray'> float64 (200, 192)
[[4.012 4.202 4.446 ... 1.174 0.68  1.323]
 [4.145 4.379 4.589 ... 1.152 0.87  1.108]
 [4.036 4.286 4.524 ... 1.07  0.874 1.473]
 ...
 [4.28  4.512 4.668 ... 1.089 0.821 1.109]
 [4.288 4.519 4.685 ... 1.161 0.814 0.929]
 [4.226 4.464 4.653 ... 1.034 0.991 0.848]]


 10%|████▏                                      | 3/31 [00:00<00:09,  3.00it/s]

Take: 591.92 ms
<class 'numpy.ndarray'> uint8 (200, 512)
[[0 0 0 ... 1 0 0]
 [1 0 0 ... 1 1 1]
 [0 0 0 ... 1 0 0]
 ...
 [1 0 0 ... 1 1 1]
 [0 1 0 ... 1 1 0]
 [1 0 0 ... 1 1 1]]
Take: 94.46 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


 19%|████████▎                                  | 6/31 [00:01<00:04,  6.10it/s]

Take: 134.01 ms
<class 'numpy.ndarray'> float64 (200, 315)
[[0.  0.3 1.9 ... 0.  0.  0. ]
 [0.  0.3 1.9 ... 0.  0.  0. ]
 [0.  0.6 2.9 ... 0.  0.  0. ]
 ...
 [0.  0.3 2.2 ... 0.  0.  0. ]
 [0.  0.3 1.9 ... 0.  0.  0. ]
 [0.  0.3 2.2 ... 0.  0.  0. ]]


 26%|███████████                                | 8/31 [00:01<00:04,  4.91it/s]

Take: 341.23 ms
<class 'numpy.ndarray'> float64 (200, 79)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Take: 192.67 ms
<class 'numpy.ndarray'> uint8 (200, 85)
[[0 0 0 ... 0 1 0]
 [0 1 1 ... 0 1 0]
 [0 1 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 1 1 ... 0 1 0]]


 32%|█████████████▌                            | 10/31 [00:01<00:03,  5.93it/s]

Take: 238.62 ms
<class 'numpy.ndarray'> uint8 (200, 110)
[[0 0 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]]


 35%|██████████████▉                           | 11/31 [00:10<00:41,  2.05s/it]

Take: 8255.99 ms
<class 'numpy.ndarray'> uint8 (200, 4860)
[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


 39%|████████████████▎                         | 12/31 [00:11<00:34,  1.82s/it]

Take: 1100.49 ms
<class 'numpy.ndarray'> uint8 (200, 307)
[[0 1 0 ... 0 0 1]
 [1 1 0 ... 0 0 1]
 [1 1 0 ... 0 0 1]
 ...
 [1 1 0 ... 0 0 1]
 [1 1 0 ... 0 0 1]
 [1 1 0 ... 0 0 1]]


 42%|█████████████████▌                        | 13/31 [00:12<00:28,  1.57s/it]

Take: 884.91 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 1 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 [0 1 1 ... 1 0 0]
 ...
 [0 1 1 ... 0 0 1]
 [0 1 1 ... 0 0 1]
 [0 1 1 ... 0 0 0]]
Take: 31.18 ms
<class 'numpy.ndarray'> uint8 (200, 1024)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


 48%|████████████████████▎                     | 15/31 [00:12<00:15,  1.01it/s]

Take: 391.64 ms
<class 'numpy.ndarray'> uint8 (200, 167)
[[0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]]


 52%|█████████████████████▋                    | 16/31 [00:13<00:15,  1.05s/it]

Take: 1251.15 ms
<class 'numpy.ndarray'> uint8 (200, 1024)
[[0 0 1 ... 0 1 1]
 [0 0 1 ... 0 0 1]
 [0 0 1 ... 0 0 1]
 ...
 [0 0 1 ... 0 0 1]
 [0 0 1 ... 1 0 1]
 [0 0 1 ... 0 0 1]]


 55%|███████████████████████                   | 17/31 [00:15<00:15,  1.08s/it]

Take: 1174.15 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[1 0 0 ... 1 1 1]
 [1 0 0 ... 1 1 1]
 [1 0 0 ... 0 1 1]
 ...
 [1 0 1 ... 1 1 1]
 [1 0 1 ... 1 1 1]
 [1 0 1 ... 1 1 1]]
Take: 76.25 ms
<class 'numpy.ndarray'> uint32 (200, 42)
[[26  0  0 ...  0  0  0]
 [26  0  0 ...  0  2  1]
 [26  0  0 ...  0  0  0]
 ...
 [34  0  0 ...  0  2  1]
 [33  0  0 ...  0  2  1]
 [27  0  0 ...  0  2  1]]


 65%|███████████████████████████               | 20/31 [04:05<07:23, 40.35s/it]

Take: 230641.02 ms
<class 'numpy.ndarray'> float32 (200, 1613)
[[ 26.231901  19.772808   0.       ... 192.        11.25       8.194445]
 [ 30.572     23.001709   0.       ... 233.        12.833333   9.111111]
 [ 27.32103   21.888456   0.       ... 210.        12.902778   8.131945]
 ...
 [ 37.01684   25.19043    0.       ... 275.        13.944445  10.805555]
 [ 36.976402  25.503395   0.       ... 276.        13.944445  10.833333]
 [ 33.538326  25.528872   0.       ... 253.        14.444445  10.      ]]


 68%|████████████████████████████▍             | 21/31 [04:06<05:25, 32.54s/it]

Take: 863.47 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 1 0 ... 1 1 0]
 [0 1 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 1 0 ... 1 1 0]
 [0 1 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]


 71%|█████████████████████████████▊            | 22/31 [04:50<05:15, 35.01s/it]

Take: 43720.44 ms
<class 'numpy.ndarray'> uint8 (200, 39972)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


 74%|███████████████████████████████▏          | 23/31 [04:50<03:33, 26.70s/it]

Take: 312.74 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


 77%|████████████████████████████████▌         | 24/31 [04:55<02:27, 21.05s/it]

Take: 4525.36 ms
<class 'numpy.ndarray'> uint8 (200, 881)
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]


 84%|███████████████████████████████████▏      | 26/31 [04:55<01:01, 12.31s/it]

Take: 587.81 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 0 0 ... 0 0 1]
 [1 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 ...
 [1 0 1 ... 0 0 1]
 [1 0 0 ... 1 0 1]
 [1 1 1 ... 1 1 1]]


100%|██████████████████████████████████████████| 31/31 [04:58<00:00,  9.62s/it]

Take: 2124.91 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Take: 185.57 ms
<class 'numpy.ndarray'> uint8 (200, 2048)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]





Unnamed: 0,class,dtype,shape,error,take
0,<class 'abc.ABCMeta'>,uint8,"(200, 2048)",,128.21
1,<class 'abc.ABCMeta'>,float64,"(200, 192)",,102.16
2,<class 'abc.ABCMeta'>,uint8,"(200, 512)",,591.92
3,<class 'abc.ABCMeta'>,,,Passed data must be molecules (rdkit.Chem.rdCh...,
4,<class 'abc.ABCMeta'>,uint8,"(200, 2048)",,94.46
5,<class 'abc.ABCMeta'>,float64,"(200, 315)",,134.01
6,<class 'abc.ABCMeta'>,float64,"(200, 79)",,341.23
7,<class 'abc.ABCMeta'>,uint8,"(200, 85)",,192.67
8,<class 'abc.ABCMeta'>,,,Passed data must be molecules (rdkit.Chem.rdCh...,
9,<class 'abc.ABCMeta'>,uint8,"(200, 110)",,238.62


In [54]:
# Seconds in a day divided by 100_000_000, scaled by 200 and then by 1000.
# Presumably the millisecond budget per 200-molecule batch (the batch size
# test_fp uses) for processing 100M molecules within one day — confirm.
24 * 60 * 60 / 100_000_000 * 200 * 1000

172.79999999999998

In [51]:
# Display the benchmark summary DataFrame.
df

Unnamed: 0,class,dtype,shape,error,take
0,AtomPairFingerprint,uint8,"(200, 2048)",,128.21
1,AutocorrFingerprint,float64,"(200, 192)",,102.16
2,AvalonFingerprint,uint8,"(200, 512)",,591.92
3,E3FPFingerprint,,,Passed data must be molecules (rdkit.Chem.rdCh...,
4,ECFPFingerprint,uint8,"(200, 2048)",,94.46
5,ERGFingerprint,float64,"(200, 315)",,134.01
6,EStateFingerprint,float64,"(200, 79)",,341.23
7,FunctionalGroupsFingerprint,uint8,"(200, 85)",,192.67
8,GETAWAYFingerprint,,,Passed data must be molecules (rdkit.Chem.rdCh...,
9,GhoseCrippenFingerprint,uint8,"(200, 110)",,238.62


In [64]:
# Full error messages for the rows whose 'error' column is non-empty.
df.loc[df['error'].str.len() > 0, 'error'].to_list()

['Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must have conf_id property set. You can use ConformerGenerator to add them.',
 'Passed data must be molecules (rdkit.Chem.rdChem.Mol instances) and each must ha

In [14]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [15]:
def replace_dy(smiles):
    """Return canonical SMILES for ``smiles`` with a ``[Dy]`` atom swapped for carbon.

    Parameters
    ----------
    smiles : str
        SMILES string to rewrite.

    Returns
    -------
    str
        Canonical SMILES of the sanitized molecule after the substitution.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a clear message instead of deep inside ReplaceSubstructs.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    # Pattern for the Dy atom, and the carbon fragment that takes its place.
    dy_pattern = Chem.MolFromSmiles('[Dy]')
    new_attachment = Chem.MolFromSmiles('C')

    # ReplaceSubstructs returns a tuple of result molecules; only the first is
    # used (the inputs are expected to contain a single [Dy] — confirm).
    new_mol = AllChem.ReplaceSubstructs(mol, dy_pattern, new_attachment)[0]

    # Clean up the edited molecule before serialising it.
    Chem.SanitizeMol(new_mol)

    # Explicit hydrogens are intentionally not added here; the original note
    # suggested them for 3D work but flagged the extra memory cost.
    return Chem.MolToSmiles(new_mol, canonical=True)

In [32]:
from skfp.fingerprints import E3FPFingerprint
from skfp.preprocessing import MolFromSmilesTransformer, ConformerGenerator
# Take the first 10_000 molecules and swap the [Dy] atom for carbon in each,
# producing a plain list of SMILES strings.
smiles = [replace_dy(raw_smiles) for raw_smiles in test_df[:10_000, 'molecule']]

# E3FP fingerprint transformer with n_jobs=-1.
fp = E3FPFingerprint(n_jobs=-1)
fp

In [33]:
%%time
# Build both preprocessing stages first, then run the pipeline:
# SMILES -> RDKit molecules -> molecules with conformers -> E3FP fingerprints.
mol_from_smiles = MolFromSmilesTransformer()
conf_gen = ConformerGenerator()

mols = conf_gen.transform(mol_from_smiles.transform(smiles))
ret = fp.transform(mols)
ret

CPU times: user 34min 19s, sys: 7.6 s, total: 34min 27s
Wall time: 35min 47s


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]], dtype=uint8)

In [34]:
# Seconds per molecule: roughly 36 minutes of wall time (the timed E3FP run
# reported 35min 47s) spread over the 10_000 molecules processed.
36 * 60 / 10_000

0.216

In [28]:
# 90_000_000 items at 0.3 s each, converted to hours and divided by 80
# (presumably 90M molecules spread over 80 parallel workers — confirm).
90_000_000 * 0.3 / 3600 / 80

93.75

In [38]:
# Same estimate using the 0.216 s per molecule computed from the E3FP timing,
# further divided by 24 to express it in days
# (80 is presumably the number of parallel workers — confirm).
90_000_000 * 0.216 / 3600 / 80 / 24

2.8125