In [1]:
%%time
from FPSim2 import FPSim2Engine
from isim_comp import gen_sim_dict
import numpy as np

# ECFP4 (Morgan, radius 2) 2048 bits
# from: https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/chembl_33.h5
fp_filename = "chembl_33.h5"
fpe = FPSim2Engine(fp_filename)
print(fpe.fps.shape)

(2372674, 34)
CPU times: user 3.3 s, sys: 1.06 s, total: 4.37 s
Wall time: 2.72 s


# FPSim2 packs fingerprints in groups of 64 bits, reshape the array
memory inefficient, each uint64 converted to 64 uint8

In [2]:
%%time
# first and last columns are the id and the popcount
fps = fpe.fps[:, 1:-1].view("uint8")
bits = np.unpackbits(fps[:, np.newaxis], axis=1).ravel()
fps = bits.reshape(int(bits.size / (fps.shape[1]*8)), fps.shape[1]*8)
print(fps.shape)

(2372674, 2048)
CPU times: user 5.55 s, sys: 1.2 s, total: 6.75 s
Wall time: 6.82 s


In [3]:
%%time
gen_sim_dict(fps)

CPU times: user 3.72 s, sys: 11.2 ms, total: 3.73 s
Wall time: 3.76 s


{'AC': 0.8723258517530809,
 'BUB': 0.649187629177515,
 'Fai': 0.48261521369877647,
 'Gle': 0.19851100403624725,
 'Ja': 0.27088015980606983,
 'JT': 0.11019273749715505,
 'RT': 0.9236614311048624,
 'RR': 0.004914421504193711,
 'SM': 0.9603160058933592,
 'SS1': 0.05830898191766747,
 'SS2': 0.9797563280678535}

# Same but not duplicating FP array in memory
Slower overall but memory efficient

In [5]:
%%time
sc = np.zeros(((fpe.fps.shape[1]-2) * 64), dtype='uint64')
for m in fpe.fps[:, 1:-1].view("uint8"):
    sc += np.unpackbits(m[:, np.newaxis], axis=1).ravel()

gen_sim_dict(sc, n_objects=fpe.fps.shape[0])

CPU times: user 15.9 s, sys: 41.6 ms, total: 16 s
Wall time: 16 s


{'AC': 0.8723258517530809,
 'BUB': 0.649187629177515,
 'Fai': 0.48261521369877647,
 'Gle': 0.19851100403624725,
 'Ja': 0.27088015980606983,
 'JT': 0.11019273749715505,
 'RT': 0.9236614311048624,
 'RR': 0.004914421504193711,
 'SM': 0.9603160058933592,
 'SS1': 0.05830898191766747,
 'SS2': 0.9797563280678535}