-
Notifications
You must be signed in to change notification settings - Fork 5
/
smiles.py
363 lines (303 loc) · 9.48 KB
/
smiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
import string
from molecule import Molecule
# TODO: Reorganize class
# TODO: Fix documentation
class Smiles(object):
"""
SMILES (Simplified Molecular Input Line Entry Specification)
This class parses SMILES strings into a collection of tokens that
are easily converted into molecular graphs.
"""
def __init__(self, smilesStr):
"""CTOR."""
self.string = smilesStr
self.tokens = self.tokenizeString(smilesStr)
# TODO: Canonoicalization
# XXX: Perhaps this isn't something I should even manage here...
self._cachedCanonical = None # Should be 'Smiles' instance
def numAtoms(self):
"""Calculates the number of non-H atoms"""
# XXX: This hydrogen-excluded policy WILL cause problems in the
# future if I don't develop a comprehensive protocol for
# handling.
cnt = 0
for x in self.tokens:
x = str(x)
if x.isalpha() and x.upper() != 'h':
cnt += 1
return cnt
def __str__(self):
"""String Representation."""
return "<Smiles: %s>" % self.string
@staticmethod
def tokenizeString(smileStr):
"""
Convert SMILES string into a list of tokens. Working with
tokens is much more convenient than dealing with a character
array. Nothing is removed from the string, simply grouped.
The following items are multiple characters wide, but constitute
'one token' each:
* Unbracketed Organic set atoms 'Cl' and 'Br'
* Bracketed Inorganics, such as 'Co' and 'Sn'
* Atomic charges (various formats): -, +++, 3-, +2
* Tetrahedral sterochemistry direction notation '@@'
* Digits beyond 9
"""
READ_AHEAD = {
'C': 'Cl',
'B': 'Br'
}
CHARGE = '+-' + string.digits
CHARGE_SIGN = '+-'
# Process string.
tokens = []
pos = 0
inBracket = False
inPercent = False
while pos < len(smileStr):
char = smileStr[pos]
# Current bracket state
if char == '[':
inBracket = True
tokens.append(char)
pos += 1
continue
if char == ']':
inBracket = False
tokens.append(char)
pos += 1
continue
# Current percent label state (for digits > 9)
if char == '%':
inPercent = True
tokens.append(char)
pos += 1
continue
if inPercent and not char.isdigit():
inPercent = False
# Group atom charges.
# Charges only occur in brackets. (TODO: Confirm via spec.)
if char in CHARGE and inBracket:
# Only confirm that this is a charge when a '+' or '-'
# is encountered. Otherwise, digits could mean anything.
# XXX: This algorithm allows for invalid formats such
# as '++5++2' and so forth, but at present I don't care
isCharge = False
chargeStr = ''
for i in range(pos, len(smileStr)):
rd = smileStr[i]
if rd not in CHARGE:
break
elif rd in CHARGE_SIGN:
isCharge = True
chargeStr += rd
if isCharge:
tokens.append(chargeStr)
pos += len(chargeStr)
continue
# Handle two+ character Inorganic set (eg. Co, Sn, etc.)
# We must be VERY careful not to group a bonded hydrogen.
# Examples of special cases to worry about are [OH]
# (Hydroxide) and [nH] (Aromatic Nitrogen bonded to a
# Hydrogen). Note: Inorganic only occur in brackets!
if char in string.uppercase and inBracket:
isInorganic = True
atomStr = ''
for i in range(pos, len(smileStr)):
rd = smileStr[i]
if rd not in string.letters:
break
atomStr += rd
if len(atomStr) > 1:
# Already know first letter is uppercase, but
# all subsequent letters must be lowercase
for x in atomStr[1:]:
if x not in string.lowercase:
isInorganic = False
if isInorganic:
tokens.append(atomStr)
pos += len(atomStr)
continue
# Handle tetrahedral stereochemistry symbol @@
# These only occur in brackets (TODO: Confirm via spec)
if char is '@' and inBracket and pos < len(smileStr)-1:
rd = char + smileStr[pos+1]
if rd == '@@':
tokens.append(rd)
pos += 2
continue
# Handle Br and Cl (Organic Subset)
if char in READ_AHEAD and pos < len(smileStr)-1:
rd = char + smileStr[pos+1]
if rd == READ_AHEAD[char]:
tokens.append(rd)
pos += 2
continue
# Group connectivity labels greater than nine.
# ie, connectivity as in 'c%12cccccc%12'
if char in string.digits and inPercent:
digitStr = ''
for i in range(pos, len(smileStr)):
rd = smileStr[i]
if rd not in string.digits:
break
digitStr += rd
tokens.append(digitStr)
pos += len(digitStr)
continue
# Group isotope digits. Must be handled separately from
# connectivity labels and charges.
# TODO: Verify from spec what happens if labels > 9 and
# isotope are present.
if char in string.digits and inBracket:
digitStr = ''
for i in range(pos, len(smileStr)):
rd = smileStr[i]
if rd not in string.digits:
break
digitStr += rd
tokens.append(digitStr)
pos += len(digitStr)
continue
# Catch other cases
tokens.append(char)
pos += 1
return tokens
def toMolecule(self):
"""Convert a SMILES string into a adjacency matrix representation."""
# First, we must convert the input string into a proper queue
# Organic subset: B, C, N, O, P, S, F, Cl, Br, I
# Everything else, including H: must be specified in brackets.
ATOMS = ['B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I']
ATOMS_UPPER = map(lambda x: x.upper(), ATOMS)
# Additional Symbols
BRANCHING = '()' # Branch Start & End
BONDS = '=#' # Double and Triple Bonds
CONNECTIVE = '%' # Connectivity beyond '9' -- TODO NOT YET HANDLED.
CHARGE = '+-' # Cation/anion charge. Only occur in brackets.
queue = self.tokens
sz = self.numAtoms()
# Output data
data = {
'connectMat': [[False for x in range(sz)] for y in range(sz)],
'bondOrderMat': [[False for x in range(sz)] for y in range(sz)],
'types': [None for x in range(sz)],
'charges': [0 for x in range(sz)],
'isotopes': [0 for x in range(sz)],
}
# Symbol type tests
#def is_atom(sym): return sym.upper() in ATOMS_UPPER
def is_atom(sym): return type(sym) is str and sym.isalpha()
def is_bond(sym): return sym in BONDS
def is_closure(sym): return sym.isdigit() # TODO: Not a good test
def is_bond_order(sym): return sym in '=#'
def is_charge(sym): return len(filter(lambda x: x in CHARGE, sym)) > 0
# Make note of the connection beween two atoms.
def connect(a1, a2, bondOrder=1):
data['connectMat'][a1][a2] = True
data['connectMat'][a2][a1] = True
data['bondOrderMat'][a1][a2] = bondOrder
data['bondOrderMat'][a2][a1] = bondOrder
# Label the atom type
def label(a, name, isotope = 0):
data['types'][a] = name
data['isotopes'][a] = isotope
# Parse the charge. Examples: +, --, 2+, -3
# TODO: Make more compact, and more valid to spec.
def parse_charge(chargeStr):
charge = 0
hasDigit = False
for x in chargeStr:
if x.isdigit():
hasDigit = True
break
# Type 1: 3+, -2
if hasDigit:
charge = int(chargeStr.replace('+', '').replace('-', ''))
if '-' in chargeStr:
charge *= -1
return charge
# Type 2: +++, --
for x in chargeStr:
if x is '-':
charge -= 1
elif x is '+':
charge += 1
return charge
# Keep track of state during parsing
atomCnt = 0 # Total num atoms
atomPrev = 0 # Previous atom
bondOrder = 1 # Bond order: 1, 2, 3
isotope = 0 # Atom isotope
branchStack = [] # Branching, eg. C(C)(C)C
ringClosures = {} # Keep track of cycles, eg. c1ccccc1
# Bracket state. Brackets denote isotope, charge, inorganics...
inBrackets = False # Currently in brackets
inBracketsAtomFound = False # Atom encountered in brackets
for sym in queue:
if is_atom(sym):
atomId = atomCnt
atomCnt += 1
label(atomId, sym, isotope)
isotope = 0
if inBrackets:
# Once atom found, isotope can't be set.
inBracketsAtomFound = True
if atomId != 0:
# TODO: Conjugated systems bond order = 1.5
connect(atomPrev, atomId, bondOrder)
bondOrder = 1
atomPrev = atomId
# Handle bracket state
if sym == '[':
inBrackets = True
inBracketsAtomFound = False
continue
if sym == ']':
inBrackets = False
inBracketsAtomFound = False
continue
# Handle branching
if sym == '(':
branchStack.append(atomPrev)
continue
if sym == ')':
atomPrev = branchStack.pop()
continue
if is_closure(sym):
# TODO: Won't handle > 9
num = int(sym)
if num not in ringClosures:
ringClosures[num] = atomPrev
else:
# TODO: Conjugated systems bond order = 1.5
connect(atomPrev, ringClosures[num], bondOrder)
bondOrder = 1
if is_bond_order(sym):
if sym == '=':
bondOrder = 2
else:
bondOrder = 3
continue
if is_charge(sym) and inBrackets:
# The '-' symbol is also used for a few limited cases
# of single bond representation.
data['charges'][atomId] = parse_charge(sym)
continue
# Isotope can only be set in brackets, before the atom.
if sym.isdigit() and inBrackets and not inBracketsAtomFound:
isotope = sym
# End queue processing, build and return Molecule object.
return Molecule(data['types'], data['bondOrderMat'],
connectMat=data['connectMat'], charges=data['charges'],
isotopes=data['isotopes'], smiles=self)
def smiles_to_molecule(smiles):
"""Function to return a MolMatrix from a smiles string without an
intermediate."""
# XXX: Do I really need to keep this?
if type(smiles) == str:
smiles = Smiles(smiles)
return smiles.toMolecule()
def molecule_to_smiles(matrix):
# TODO: Need to convert back into a smiles expression.
pass