Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug for Mol2VecFingerprint #2242

Merged
merged 1 commit into from Oct 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 2 additions & 15 deletions deepchem/feat/molecule_featurizers/mol2vec_fingerprint.py
Expand Up @@ -42,8 +42,7 @@ class Mol2VecFingerprint(MolecularFeaturizer):
def __init__(self,
pretrain_model_path: Optional[str] = None,
radius: int = 1,
unseen: str = 'UNK',
gather_method: str = 'sum'):
unseen: str = 'UNK'):
"""
Parameters
----------
Expand All @@ -56,9 +55,6 @@ def __init__(self,
github repository.
unseen: str, optional (default 'UNK')
The string used to replace uncommon words/identifiers while training.
gather_method: str, optional (default 'sum')
How to aggregate the vectors of identifiers extracted from Mol2vec.
'sum' or 'mean' is supported.
"""
try:
from gensim.models import word2vec
Expand All @@ -68,7 +64,6 @@ def __init__(self,

self.radius = radius
self.unseen = unseen
self.gather_method = gather_method
self.sentences2vec = sentences2vec
self.mol2alt_sentence = mol2alt_sentence
if pretrain_model_path is None:
Expand Down Expand Up @@ -98,13 +93,5 @@ def _featurize(self, mol: RDKitMol) -> np.ndarray:
1D array of mol2vec fingerprint. The default length is 300.
"""
sentence = self.mol2alt_sentence(mol, self.radius)
vec_identifiers = self.sentences2vec(
sentence, self.model, unseen=self.unseen)
if self.gather_method == 'sum':
feature = np.sum(vec_identifiers, axis=0)
elif self.gather_method == 'mean':
feature = np.mean(vec_identifiers, axis=0)
else:
raise ValueError(
'Not supported gather_method type. Please set "sum" or "mean"')
feature = self.sentences2vec([sentence], self.model, unseen=self.unseen)[0]
return feature
10 changes: 2 additions & 8 deletions deepchem/feat/tests/test_mol2vec_fingerprint.py
@@ -1,7 +1,5 @@
import unittest

import numpy as np

from deepchem.feat import Mol2VecFingerprint


Expand All @@ -23,9 +21,5 @@ def test_mol2vec_fingerprint(self):
Test simple fingerprint.
"""
featurizer = Mol2VecFingerprint()
feature_sum = featurizer([self.mol])
assert feature_sum.shape == (1, 300)
featurizer = Mol2VecFingerprint(gather_method='mean')
feature_mean = featurizer([self.mol])
assert feature_mean.shape == (1, 300)
assert not np.allclose(feature_sum, feature_mean)
feature = featurizer([self.mol])
assert feature.shape == (1, 300)
2 changes: 1 addition & 1 deletion requirements.yml
Expand Up @@ -20,4 +20,4 @@ dependencies:
- pymatgen
- simdna
- xgboost
- -e git+https://github.com/samoturk/mol2vec#egg=mol2vec
- git+https://github.com/samoturk/mol2vec