In [1]:
%%time
from FPSim2 import FPSim2Engine
from div_isim import diversity
import numpy as np

# Fingerprint database: ECFP4 (Morgan, radius 2), 2048 bits per molecule.
# Downloaded from: https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_33.h5
# NOTE(review): the URL goes through the `latest` directory, which presumably
# tracks the current ChEMBL release -- confirm it still resolves for chembl_33.
fp_filename = "chembl_33.h5"
# Open the FPSim2 file; the fingerprint array is exposed as `fpe.fps`.
fpe = FPSim2Engine(fp_filename)
# Shape is (n_molecules, n_columns): one row per molecule.
print(fpe.fps.shape)

(2372674, 34)
CPU times: user 3.32 s, sys: 1.06 s, total: 4.38 s
Wall time: 2.76 s


In [2]:
%%time
# Column 0 is the molecule id and the last column is the popcount; everything
# in between is the packed fingerprint. Reinterpret those words as raw bytes.
fps = fpe.fps[:, 1:-1].view(np.uint8)
# FPSim2 packs fingerprints in groups of 64 bits, so expand every byte into its
# 8 bits and lay the result out as one row of nBits 0/1 values per molecule.
# Unpacking along the inserted axis groups the columns by bit position within
# the byte (not byte by byte); the column order is the same for every row.
bits = np.unpackbits(np.expand_dims(fps, axis=1), axis=1).ravel()
fps = bits.reshape(-1, fpe.fp_params['nBits'])
print(fps.shape)

(2372674, 2048)
CPU times: user 5.68 s, sys: 1.12 s, total: 6.81 s
Wall time: 6.84 s


In [3]:
# It may take too long with whole ChEMBL, so work on a random subset of
# (at most) 10k compounds.
#
# The subset is drawn WITHOUT replacement: sampling row numbers with
# np.random.randint can return the same row more than once, which would put
# duplicated compounds into the diversity selection. A seeded Generator also
# makes the subset identical on every Restart & Run All.
SEED = 42
N_SAMPLE = 10_000
rng = np.random.default_rng(SEED)
# min() keeps the cell usable on fingerprint sets smaller than N_SAMPLE,
# where choice(..., replace=False) would otherwise raise.
idx = rng.choice(fps.shape[0], size=min(N_SAMPLE, fps.shape[0]), replace=False)
# idx holds row positions in `fps` (and therefore in `fpe.fps`); keep it around
# to map the selection back to molecule ids later.
div_fps = fps[idx, :]

In [4]:
%%time
# Run the div_isim diversity selection on the sampled fingerprint matrix.
# NOTE(review): the meaning of the second argument (1) is not visible from this
# notebook -- presumably the percentage of compounds to select; confirm against
# div_isim.diversity.
# This is the slow step: about 2 minutes of wall time in the recorded run.
res = diversity(div_fps, 1)

CPU times: user 2min 4s, sys: 322 ms, total: 2min 4s
Wall time: 2min 4s


In [5]:
# `res` indexes into the sampled subset; `idx[res]` turns those positions back
# into row numbers of the full FPSim2 array, whose first column stores the
# ChEMBL molregno of each compound.
fpe.fps[idx[res], 0]

array([2284814, 1673875,  259214,  515591, 2226663,  657222,  314882,
       1676149,  191333,  755686, 1673506, 1326212, 1363206,  429763,
         67862, 1315841,  427931, 1316033,  374548,  453497, 1672488,
       1480188,  284208, 1447040,  361880,  655895,   70448,  428038,
       1214938,   66232, 1308700, 1328183,  230211, 2396959,  711923,
       1382268, 1327956, 1734715,  786235, 2266285,   27855,  655919,
       1829973, 2454489,  504216,  364691, 1325513,  115808, 1280526,
        991320, 1864422,  811727, 1302204, 1329034, 2359730, 1294491,
        428773,  761054,  275110,  108683, 1313125,  129516,  250973,
        323721, 1829971,  144030,   33255, 1767997,  955205, 1475230,
        259242,  154886, 1328292, 2185236,  301100, 1058050,  694674,
        109171,  574723,   19469, 1383547, 1274393,  697333, 1488064,
        669070, 1677857,  329531, 2409131, 1160335,  723538,  700657,
       1231994, 1297006, 1736551,   29366,  882240,  775323, 2651941,
       1339541, 1291