Skip to content

Commit

Permalink
fixed bad structures. (#186)
Browse files Browse the repository at this point in the history
SMILES parser fixes:
* aromatic Te support
* ring closures: %123 is now read as closure 12 followed by closure 3
* cis/trans marks on bonds connected to aromatic rings (ad hoc handling of invalid aromaticity)

Kekule fixes:
* ad hoc handling of a double bond inside a ring
* quinone-like N+
* As atom support
* aromatic S+ support
* rule-based fixes of bad aromaticity with bad standardization.
  • Loading branch information
stsouko committed Jul 6, 2021
1 parent 28aa9a5 commit 597c648
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 14 deletions.
123 changes: 115 additions & 8 deletions CGRtools/algorithms/aromatics.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,15 +312,15 @@ def __prepare_rings(self: 'MoleculeContainer'):
rings = defaultdict(list) # aromatic skeleton
pyroles = set()

double_bonded = set()
double_bonded = defaultdict(list)
triple_bonded = set()
for n, m_bond in bonds.items():
for m, bond in m_bond.items():
bo = bond.order
if bo == 4:
rings[n].append(m)
elif bo == 2:
double_bonded.add(n)
double_bonded[n].append(m)
elif bo == 3:
triple_bonded.add(n)

Expand All @@ -334,21 +334,29 @@ def __prepare_rings(self: 'MoleculeContainer'):
if set(r).issubset(rings):
n, *_, m = r
if n not in rings[m]: # fix invalid structures: c1ccc-cc1
# remove inner ring double bonds: c1ccc=cc1
if n in double_bonded and m in double_bonded and m in double_bonded[n]:
double_bonded[n].remove(m)
double_bonded[m].remove(n)
rings[m].append(n)
rings[n].append(m)
elif m in copy_rings[n]:
copy_rings[n].remove(m)
copy_rings[m].remove(n)
for n, m in zip(r, r[1:]):
if n not in rings[m]:
if n in double_bonded and m in double_bonded and m in double_bonded[n]:
double_bonded[n].remove(m)
double_bonded[m].remove(n)
rings[m].append(n)
rings[n].append(m)
elif m in copy_rings[n]:
copy_rings[n].remove(m)
copy_rings[m].remove(n)

if any(len(ms) not in (2, 3) for ms in rings.values()):
raise InvalidAromaticRing('not in ring aromatic bond or hypercondensed rings')
raise InvalidAromaticRing('not in ring aromatic bond or hypercondensed rings: '
f'{{{", ".join(str(n) for n, ms in rings.items() if len(ms) not in (2, 3))}}}')

# fix invalid smiles: c1ccccc1c2ccccc2 instead of c1ccccc1-c2ccccc2
seen = set()
Expand All @@ -361,11 +369,16 @@ def __prepare_rings(self: 'MoleculeContainer'):
rings[m].remove(n)
bonds[n][m]._Bond__order = 1

double_bonded &= rings.keys()
# get double bonded ring atoms
double_bonded = {n for n, ms in double_bonded.items() if ms and n in rings}
if any(len(rings[n]) != 2 for n in double_bonded): # double bonded never condensed
raise InvalidAromaticRing('quinone valence error')
if any(atoms[n].atomic_number not in (6, 15, 16, 34, 52) or charges[n] for n in double_bonded):
raise InvalidAromaticRing('quinone should be neutral S, Se, Te, C, P atom')
for n in double_bonded:
if atoms[n].atomic_number == 7:
if charges[n] != 1:
raise InvalidAromaticRing('quinone should be charged N atom')
elif atoms[n].atomic_number not in (6, 15, 16, 33, 34, 52) or charges[n]:
raise InvalidAromaticRing('quinone should be neutral S, Se, Te, C, P, As atom')

for n in rings:
an = atoms[n].atomic_number
Expand All @@ -389,7 +402,7 @@ def __prepare_rings(self: 'MoleculeContainer'):
raise InvalidAromaticRing
else:
raise InvalidAromaticRing
elif an in (7, 15):
elif an in (7, 15, 33):
if ac == 0: # pyrole or pyridine. include radical pyrole
if radicals[n]:
if ab != 2:
Expand All @@ -408,7 +421,7 @@ def __prepare_rings(self: 'MoleculeContainer'):
double_bonded.add(n)
elif ah:
raise InvalidAromaticRing
elif ab != 4 or an != 15: # P(V) in ring
elif ab != 4 or an not in (15, 33): # P(V) in ring
raise InvalidAromaticRing
elif ac == -1: # pyrole only
if ab != 2 or radicals[n]:
Expand Down Expand Up @@ -449,6 +462,8 @@ def __prepare_rings(self: 'MoleculeContainer'):
double_bonded.add(n)
elif ac != 1:
raise InvalidAromaticRing('S, Se, Te cation in benzene like ring expected')
elif ab == 3 and ac == 1 and not radicals[n]:
double_bonded.add(n)
else:
raise InvalidAromaticRing('S, Se, Te hypervalent ring')
elif an == 5: # boron
Expand Down Expand Up @@ -708,6 +723,98 @@ def __bad_rings_rules(self):
bonds_fix = ((1, 2, 1),)
rules.append((q, atom_fix, bonds_fix))

#
# : [S+] : >> : S :
# | \\
# [O-] O
#
q = query.QueryContainer()
q.add_atom('S', neighbors=3, hybridization=4, charge=1)
q.add_atom('O', neighbors=1, charge=-1)
q.add_bond(1, 2, 1)
atom_fix = {1: {'_charges': 0}, 2: {'_charges': 0, '_hybridizations': 2}}
bonds_fix = ((1, 2, 2),)
rules.append((q, atom_fix, bonds_fix))

#
# [O-]-N:C:C:[N+]=O
#
q = query.QueryContainer()
q.add_atom('O', neighbors=1, charge=-1)
q.add_atom('N', neighbors=3)
q.add_atom('C')
q.add_atom('C')
q.add_atom('N', neighbors=3, charge=1)
q.add_atom('O', neighbors=1)
q.add_bond(1, 2, 1)
q.add_bond(2, 3, 4)
q.add_bond(3, 4, 4)
q.add_bond(4, 5, 4)
q.add_bond(5, 6, 2)
atom_fix = {2: {'_charges': 1}, 6: {'_charges': -1}}
bonds_fix = ((5, 6, 1),)
rules.append((q, atom_fix, bonds_fix))

#
# N : A : N - ?
# : :
# C # C
q = query.QueryContainer()
q.add_atom('N', neighbors=2)
q.add_atom('C', neighbors=2)
q.add_atom('C', neighbors=2)
q.add_atom('N', neighbors=(2, 3))
q.add_atom(ListElement(['C', 'N']))
q.add_bond(1, 2, 4)
q.add_bond(2, 3, 3)
q.add_bond(3, 4, 4)
q.add_bond(4, 5, 4)
q.add_bond(1, 5, 4)
atom_fix = {}
bonds_fix = ((2, 3, 4),)
rules.append((q, atom_fix, bonds_fix))

#
# C:[N+]:[C-]
# \\
# O
#
q = query.QueryContainer()
q.add_atom('N', neighbors=3, charge=1)
q.add_atom('O', neighbors=1)
q.add_atom('C', neighbors=(2, 3), charge=-1)
q.add_atom('C', neighbors=(2, 3))
q.add_bond(1, 2, 2)
q.add_bond(1, 3, 4)
q.add_bond(1, 4, 4)
atom_fix = {2: {'_charges': -1, '_hybridizations': 1}, 3: {'_charges': 0}}
bonds_fix = ((1, 2, 1),)
rules.append((q, atom_fix, bonds_fix))

#
# O=[N+] : C
# : :
# O : N : C
q = query.QueryContainer()
q.add_atom('N', neighbors=3, charge=1)
q.add_atom('O', neighbors=1)
q.add_atom('O', neighbors=2)
q.add_atom('C', neighbors=(2, 3))
q.add_atom('C', neighbors=(2, 3))
q.add_atom('N', neighbors=(2, 3))
q.add_bond(1, 2, 2)
q.add_bond(1, 3, 4)
q.add_bond(1, 4, 4)
q.add_bond(3, 6, 4)
q.add_bond(4, 5, 4)
q.add_bond(5, 6, 4)
atom_fix = {1: {'_hybridizations': 2}, 3: {'_hybridizations': 1}, 4: {'_hybridizations': 2},
5: {'_hybridizations': 2}, 6: {'_hybridizations': 1}}
bonds_fix = ((1, 3, 1), (1, 4, 1), (3, 6, 1), (4, 5, 2), (5, 6, 1))
rules.append((q, atom_fix, bonds_fix))

# todo: refactor!

# imidazolium
# R - N : C R - N : C
# : : : :
Expand Down
14 changes: 9 additions & 5 deletions CGRtools/files/SMILESrw.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@
dyn_charge_dict.update(tmp)
dyn_radical_dict = {'*': (True,), '*>^': (True, ('radical', None)), '^>*': (False, ('radical', None))}

atom_re = compile(r'([1-9][0-9]{0,2})?([A-IK-PR-Zacnopsb][a-ik-pr-vy]?)(@@|@)?(H[1-4]?)?([+-][1-4+-]?)?(:[0-9]{1,4})?')
dyn_atom_re = compile(r'([1-9][0-9]{0,2})?([A-IK-PR-Zacnopsb][a-ik-pr-vy]?)([+-0][1-4+-]?(>[+-0][1-4+-]?)?)?'
atom_re = compile(r'([1-9][0-9]{0,2})?([A-IK-PR-Zacnopsbt][a-ik-pr-vy]?)(@@|@)?(H[1-4]?)?([+-][1-4+-]?)?(:[0-9]{1,4})?')
dyn_atom_re = compile(r'([1-9][0-9]{0,2})?([A-IK-PR-Zacnopsbt][a-ik-pr-vy]?)([+-0][1-4+-]?(>[+-0][1-4+-]?)?)?'
r'([*^](>[*^])?)?')
delimiter = compile(r'[=:]')
cx_fragments = compile(r'f:(?:[0-9]+(?:\.[0-9]+)+)(?:,(?:[0-9]+(?:\.[0-9]+)+))*')
Expand Down Expand Up @@ -486,6 +486,9 @@ def _raw_tokenize(smiles):
if not token and s == '0':
raise IncorrectSmiles('number starts with 0')
token.append(s)
if len(token) == 2:
tokens.append((token_type, token))
token_type = token = None
else:
if s == '0':
raise IncorrectSmiles('number starts with 0')
Expand Down Expand Up @@ -624,7 +627,7 @@ def __atom_parse(token):
else:
mapping = 0

if element in ('c', 'n', 'o', 'p', 's', 'as', 'se', 'b'):
if element in ('c', 'n', 'o', 'p', 's', 'as', 'se', 'b', 'te'):
_type = 8
element = element.capitalize()
else:
Expand Down Expand Up @@ -662,7 +665,7 @@ def __dynatom_parse(token):
else:
is_radical = False

if element in ('c', 'n', 'o', 'p', 's', 'as', 'se', 'b'):
if element in ('c', 'n', 'o', 'p', 's', 'as', 'se', 'b', 'te'):
_type = 12
element = element.capitalize()
else:
Expand Down Expand Up @@ -793,7 +796,8 @@ def _parse_tokens(self, tokens):
if bt == 1:
bonds.append((atom_num, last_num, b))
elif bt == 9:
bonds.append((atom_num, last_num, 1))
bonds.append((atom_num, last_num,
4 if token_type in (8, 12) and atoms_types[last_num] in (8, 12) else 1))
stereo_bonds[last_num][atom_num] = b
stereo_bonds[atom_num][last_num] = not b
elif bt == 10:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def finalize_options(self):

setup(
name='CGRtools',
version='4.2.13',
version='4.2.14',
packages=['CGRtools', 'CGRtools.algorithms', 'CGRtools.algorithms.calculate2d', 'CGRtools.algorithms.components',
'CGRtools.algorithms.standardize', 'CGRtools.containers', 'CGRtools.files', 'CGRtools.files._mdl',
'CGRtools.periodictable', 'CGRtools.periodictable.element', 'CGRtools.reactor', 'CGRtools.utils',
Expand Down

0 comments on commit 597c648

Please sign in to comment.