/
mol2vec_fingerprint.py
203 lines (171 loc) · 7.41 KB
/
mol2vec_fingerprint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
from os import path
from typing import Optional
import numpy as np
from rdkit.Chem import AllChem
from deepchem.utils import download_url, get_data_dir, untargz_file
from deepchem.utils.typing import RDKitMol
from deepchem.feat.base_classes import MolecularFeaturizer
# Download location of the default pretrained Mol2Vec model archive (tar.gz).
DEFAULT_PRETRAINED_MODEL_URL = (
    'https://deepchemdata.s3-us-west-1.amazonaws.com'
    '/trained_models/mol2vec_model_300dim.tar.gz')
def _mol2alt_sentence(mol, radius):
    """Build the "alternating" Mol2Vec sentence of a molecule.

    Same as mol2vec's ``mol2sentence()`` except that it only returns the
    alternating sentence. Calculates ECFP (Morgan fingerprint) substructure
    identifiers and returns them as a 'sentence' of strings in which the
    identifiers of all radii are interleaved atom by atom:
    atom 0 radius 0, atom 0 radius 1, ..., atom 1 radius 0, atom 1 radius 1, ...

    NOTE: Words are ALWAYS ordered according to atom order in the input mol
    object.
    NOTE: Not every atom receives an identifier at every radius; missing
    (atom, radius) entries are skipped, so the sentence may be shorter than
    ``n_atoms * (radius + 1)``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to convert.
    radius : int or float
        Fingerprint radius. It is truncated to an integer before use.

    Returns
    -------
    list
        Alternating sentence: a list of identifier strings.
    """
    # Copied from https://github.com/samoturk/mol2vec/blob/850d944d5f48a58e26ed0264332b5741f72555aa/mol2vec/features.py#L129-L168
    # The radius is documented as possibly being a float; coerce it once so
    # that the fingerprint call and the radii enumerated below always agree.
    radius = int(radius)
    radii = range(radius + 1)

    # info is filled in place: {identifier: ((atom_idx, radius), ...)}
    info = {}
    AllChem.GetMorganFingerprint(mol, radius, bitInfo=info)

    # {atom index: {fp radius: identifier or None}}, in molecule atom order.
    identifier_at = {
        atom.GetIdx(): dict.fromkeys(radii) for atom in mol.GetAtoms()
    }
    for identifier, environments in info.items():
        for atom_idx, radius_at in environments:
            identifier_at[atom_idx][radius_at] = identifier

    # Merge identifiers alternating radius into one sentence. Only the None
    # placeholders are dropped; a (valid) identifier equal to 0 is kept.
    return [
        str(by_radius[r])
        for by_radius in identifier_at.values()
        for r in radii
        if by_radius[r] is not None
    ]
class Mol2VecFingerprint(MolecularFeaturizer):
    """Mol2Vec fingerprints.

    This class converts molecules to vector representations by using Mol2Vec.
    Mol2Vec is an unsupervised machine learning approach to learn vector
    representations of molecular substructures and the algorithm is based on
    Word2Vec, which is one of the most popular techniques to learn word
    embeddings using neural networks in NLP.
    Please see the details from [1]_.

    The Mol2Vec requires the pretrained model, so we use the model which is
    put on the mol2vec github repository [2]_. The default model was trained
    on 20 million compounds downloaded from ZINC using the following
    parameters.

    - radius 1
    - UNK to replace all identifiers that appear less than 4 times
    - skip-gram and window size of 10
    - embeddings size 300

    References
    ----------
    .. [1] Jaeger, Sabrina, Simone Fulle, and Samo Turk. "Mol2vec: unsupervised machine learning
        approach with chemical intuition." Journal of chemical information and modeling 58.1 (2018): 27-35.
    .. [2] https://github.com/samoturk/mol2vec/

    Note
    ----
    This class requires gensim to be installed.

    Examples
    --------
    >>> import deepchem as dc
    >>> from rdkit import Chem
    >>> smiles = ['CCC']
    >>> featurizer = dc.feat.Mol2VecFingerprint()
    >>> features = featurizer.featurize(smiles)
    >>> type(features)
    <class 'numpy.ndarray'>
    >>> features[0].shape
    (300,)

    """

    def __init__(self,
                 pretrain_model_path: Optional[str] = None,
                 radius: int = 1,
                 unseen: str = 'UNK'):
        """
        Parameters
        ----------
        pretrain_model_path: str, optional
            The path for pretrained model. If this value is None, we use the model which is put on
            github repository (https://github.com/samoturk/mol2vec/tree/master/examples/models).
            The model is trained on 20 million compounds downloaded from ZINC.
        radius: int, optional (default 1)
            The fingerprint radius. The default value was used to train the model which is put on
            github repository.
        unseen: str, optional (default 'UNK')
            The string used to replace uncommon words/identifiers while training.

        Raises
        ------
        ImportError
            If gensim is not installed.
        """
        try:
            from gensim.models import word2vec
        except ModuleNotFoundError as err:
            # The guarded import is gensim, so name it in the message.
            raise ImportError(
                "This class requires gensim to be installed.") from err

        self.radius = radius
        self.unseen = unseen
        self.mol2alt_sentence = _mol2alt_sentence

        if pretrain_model_path is None:
            data_dir = get_data_dir()
            pretrain_model_path = path.join(data_dir,
                                            'mol2vec_model_300dim.pkl')
            if not path.exists(pretrain_model_path):
                # Reuse an already downloaded archive; otherwise fetch it
                # first, then unpack it into the data directory.
                targz_file = path.join(data_dir,
                                       'mol2vec_model_300dim.tar.gz')
                if not path.exists(targz_file):
                    download_url(DEFAULT_PRETRAINED_MODEL_URL, data_dir)
                untargz_file(targz_file, data_dir)

        # load pretrained models
        self.model = word2vec.Word2Vec.load(pretrain_model_path)

    def sentences2vec(self, sentences: list, model, unseen=None) -> np.ndarray:
        """Generate vectors for each sentence (list) in a list of sentences.

        The vector of a sentence is simply the sum of the vectors of its
        individual words. A sentence that contributes no word vectors (an
        empty sentence, or only unknown words when ``unseen`` is not given)
        yields a zero vector of length ``model.wv.vector_size``.

        Parameters
        ----------
        sentences : list, array
            List with sentences
        model : word2vec.Word2Vec
            Gensim word2vec model
        unseen : None, str
            Keyword for unseen words. If None, those words are skipped.
            https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

        Returns
        -------
        np.array
            One row per input sentence.
        """
        word_vectors = model.wv
        # Vocabulary membership is tested directly against the mapping, once
        # per word, instead of rebuilding a set intersection for every word.
        vocabulary = word_vectors.key_to_index
        unseen_vec = word_vectors.get_vector(unseen) if unseen else None

        vec = []
        for sentence in sentences:
            if unseen:
                # Unknown words are replaced by the vector of `unseen`.
                parts = [
                    word_vectors.get_vector(word)
                    if word in vocabulary else unseen_vec
                    for word in sentence
                ]
            else:
                # Unknown words are skipped.
                parts = [
                    word_vectors.get_vector(word)
                    for word in sentence
                    if word in vocabulary
                ]
            if parts:
                vec.append(sum(parts))
            else:
                # sum([]) would give the integer 0 and break the output
                # shape; use an explicit zero vector instead.
                # NOTE(review): float32 assumed to match the model's vectors.
                vec.append(
                    np.zeros(word_vectors.vector_size, dtype=np.float32))
        return np.array(vec)

    def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
        """
        Calculate the Mol2Vec fingerprint of a molecule.

        Parameters
        ----------
        datapoint: rdkit.Chem.rdchem.Mol
            RDKit Mol object

        Returns
        -------
        np.ndarray
            1D array of mol2vec fingerprint. The default length is 300.
        """
        if 'mol' in kwargs:
            import warnings

            # Keep honouring the deprecated keyword, but warn instead of
            # raising: raising made the assignment below unreachable in effect.
            datapoint = kwargs.get("mol")
            warnings.warn(
                'Mol is being phased out as a parameter, please pass "datapoint" instead.',
                DeprecationWarning,
                stacklevel=2)

        sentence = self.mol2alt_sentence(datapoint, self.radius)
        feature = self.sentences2vec([sentence],
                                     self.model,
                                     unseen=self.unseen)[0]
        return feature