Skip to content

Commit

Permalink
Allow fingerprint querying
Browse files Browse the repository at this point in the history
  • Loading branch information
roman-bushuiev authored and eloyfelix committed Jul 18, 2023
1 parent 2948019 commit 814bbb5
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 20 deletions.
15 changes: 8 additions & 7 deletions FPSim2/FPSim2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import concurrent.futures as cf
from rdkit.DataStructs import ExplicitBitVect
from .io.chem import get_bounds_range
from typing import Callable, Any, Tuple, Union
from .FPSim2lib import (
Expand Down Expand Up @@ -84,14 +85,14 @@ def __init__(
self.empty_subs = np.ndarray((0,), dtype="<u4")

def similarity(
self, query_string: str, threshold: float, n_workers=1
self, query: Union[str, ExplicitBitVect], threshold: float, n_workers=1
) -> np.ndarray:
"""Runs a Tanimoto search.
Parameters
----------
query_string : str
SMILES, InChI or molblock.
query : Union[str, ExplicitBitVect]
SMILES, InChI, molblock or fingerprint as ExplicitBitVect.
threshold: float
Similarity threshold.
Expand All @@ -109,21 +110,21 @@ def similarity(
"Load the fingerprints into memory before running a in memory search"
)

np_query = self.load_query(query_string)
query = self.load_query(query)
bounds = get_bounds_range(
np_query, threshold, None, None, self.popcnt_bins, "tanimoto"
query, threshold, None, None, self.popcnt_bins, "tanimoto"
)

if not bounds:
results = self.empty_sim
else:
if n_workers == 1:
results = TanimotoSearch(np_query, self.fps, threshold, *bounds)
results = TanimotoSearch(query, self.fps, threshold, *bounds)
else:
results = self._parallel(
search_func=TanimotoSearch,
executor=cf.ThreadPoolExecutor,
query=np_query,
query=query,
db=self.fps,
args=(threshold,),
bounds=bounds,
Expand Down
20 changes: 13 additions & 7 deletions FPSim2/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from abc import ABC, abstractmethod
from .io.chem import load_molecule, build_fp
from .io.chem import load_molecule, build_fp, process_fp
from .io.backends import PyTablesStorageBackend
from .io.backends import SqlaStorageBackend
from sqlalchemy import create_mock_engine
from rdkit.DataStructs import ExplicitBitVect
from typing import Union
import numpy as np


Expand Down Expand Up @@ -67,21 +69,25 @@ def fp_params(self):
def rdkit_ver(self):
return self.storage.rdkit_ver

def load_query(self, query_string: str) -> np.ndarray:
"""Loads the query molecule from SMILES, molblock or InChI.
def load_query(self, query: Union[str, ExplicitBitVect]) -> np.ndarray:
"""Loads the query fingerprint from SMILES, molblock, InChI or ExplicitBitVect fingerprint.
Parameters
----------
query_string : str
SMILES, InChi or molblock.
query : Union[str, ExplicitBitVect]
SMILES, InChI, molblock or fingerprint as ExplicitBitVect.
Returns
-------
query : numpy array
Query fingerprint as a NumPy array.
"""
rdmol = load_molecule(query_string)
fp = build_fp(rdmol, self.fp_type, self.fp_params, 0)

if isinstance(query, ExplicitBitVect):
fp = process_fp(query, 0)
else:
rdmol = load_molecule(query)
fp = build_fp(rdmol, self.fp_type, self.fp_params, 0)
return np.array(fp, dtype=np.uint64)

@abstractmethod
Expand Down
16 changes: 10 additions & 6 deletions FPSim2/io/chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from FPSim2.FPSim2lib.utils import BitStrToIntList, PyPopcount
from collections.abc import Iterable
from rdkit.Chem import rdMolDescriptors
from rdkit.DataStructs import ExplicitBitVect
from rdkit.Avalon import pyAvalonTools
from rdkit import Chem
import numpy as np
Expand Down Expand Up @@ -78,16 +79,19 @@
}


def rdmol_to_efp(rdmol: Chem.Mol, fp_func: str, fp_params: Dict[str, Any]) -> List[int]:
fp = FP_FUNCS[fp_func](rdmol, **fp_params)
return BitStrToIntList(fp.ToBitString())
def rdmol_to_efp(rdmol: Chem.Mol, fp_func: str, fp_params: Dict[str, Any]) -> ExplicitBitVect:
    """Compute a fingerprint for ``rdmol`` with the generator registered as ``fp_func``.

    Parameters
    ----------
    rdmol : Chem.Mol
        Molecule to fingerprint.
    fp_func : str
        Key into ``FP_FUNCS`` selecting the fingerprint generator.
    fp_params : Dict[str, Any]
        Keyword arguments forwarded unchanged to the generator.

    Returns
    -------
    ExplicitBitVect
        Whatever the selected generator returns (annotated as a bit vector).
    """
    fingerprinter = FP_FUNCS[fp_func]
    return fingerprinter(rdmol, **fp_params)


def build_fp(rdmol, fp_type, fp_params, mol_id):
efp = rdmol_to_efp(rdmol, fp_type, fp_params)
popcnt = PyPopcount(np.array(efp, dtype=np.uint64))
fp = (mol_id, *efp, popcnt)
return fp
return process_fp(efp, mol_id)


def process_fp(fp, mol_id):
    """Flatten a bit-vector fingerprint into a ``(mol_id, *ints, popcnt)`` tuple.

    Parameters
    ----------
    fp
        Fingerprint object exposing ``ToBitString()``.
    mol_id
        Identifier placed in the first position of the returned tuple.

    Returns
    -------
    tuple
        ``mol_id``, followed by the integers produced by ``BitStrToIntList``
        from the fingerprint's bit string, followed by the value returned by
        ``PyPopcount`` for those integers.
    """
    words = BitStrToIntList(fp.ToBitString())
    # PyPopcount is fed the integers as an unsigned 64-bit array.
    n_set_bits = PyPopcount(np.array(words, dtype=np.uint64))
    return (mol_id, *words, n_set_bits)


def load_molecule(mol_string: str) -> Chem.Mol:
Expand Down
25 changes: 25 additions & 0 deletions tests/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from FPSim2 import FPSim2Engine
from rdkit import Chem, DataStructs
from rdkit.Chem import rdMolDescriptors
from FPSim2.io import create_db_file
import numpy as np
import pytest
import math
Expand Down Expand Up @@ -176,6 +178,29 @@ def test_load_fps_sort():
assert fpe.popcnt_bins == fpe2.popcnt_bins


def test_query_fp():
    """Check that querying with a pre-computed fingerprint matches a SMILES query.

    A small three-molecule index is built on disk, searched once with the
    SMILES string and once with the equivalent Morgan bit vector, and the two
    result arrays are compared.
    """
    query = 'CC(=O)Oc1ccccc1C(=O)O'
    radius = 2
    n_bits = 4096
    index_pth = os.path.join(TESTS_DIR, 'data/test_fp_query_index.h5')
    threshold = 0.01
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(query), radius=radius, nBits=n_bits
    )

    try:
        create_db_file([
            query,
            'C1C(N=C(S1)C2=NC3=C(S2)C=C(C=C3)O)C(=O)O',
            '[H][C@@]12[C@@H](CC[C@]3(C)C1C(C)=CC[C@@]23[H])C(C)C'
        ], index_pth, 'Morgan', {'radius': radius, 'nBits': n_bits})
        fpe = FPSim2Engine(index_pth)

        result_smi = fpe.similarity(query, threshold)
        result_fp = fpe.similarity(fp, threshold)
    finally:
        # Always clean up the temporary index, even when building or searching
        # fails, so a broken run does not leave a stale file in the test data
        # directory. The existence check keeps a failed create_db_file from
        # being masked by a FileNotFoundError raised here.
        if os.path.exists(index_pth):
            os.remove(index_pth)

    # The query molecule is the first one indexed; presumably the top hit is
    # (mol_id == 1, similarity == 1) -- confirm against the result dtype.
    assert result_fp[0][0] == result_fp[0][1] == 1
    assert (result_smi == result_fp).all()


@pytest.mark.parametrize("n_workers", (1, 2, 4))
def test_similarity(n_workers):
in_file = os.path.join(TESTS_DIR, "data/test.h5")
Expand Down

0 comments on commit 814bbb5

Please sign in to comment.