Skip to content

Commit

Permalink
smiles generation algorithm refactored.
Browse files Browse the repository at this point in the history
caution! fixed rare bug.
new canonic ordering.
now supports stereo.

the old algorithm is still available by formatting with the key 'o'.
  • Loading branch information
stsouko committed Jan 20, 2021
1 parent bdd3674 commit 59948ab
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 31 deletions.
90 changes: 62 additions & 28 deletions CGRtools/algorithms/smiles.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright 2017-2020 Ramil Nugmanov <nougmanoff@protonmail.com>
# Copyright 2017-2021 Ramil Nugmanov <nougmanoff@protonmail.com>
# Copyright 2019 Timur Gimadiev <timur.gimadiev@gmail.com>
# This file is part of CGRtools.
#
Expand Down Expand Up @@ -47,7 +47,7 @@ class Smiles:

@cached_method
def __str__(self):
return ''.join(self._smiles(self.atoms_order.get))
return ''.join(self._smiles(self._smiles_order))

def __format__(self, format_spec):
"""
Expand All @@ -58,12 +58,13 @@ def __format__(self, format_spec):
!s - Disable stereo marks.
!h - Disable hybridization marks in queries. Returns non-unique signature.
!n - Disable neighbors marks in queries. Returns non-unique signature.
!g - Disable hydrogens marks in queries. Returns non-unique signature.
!c - Disable rings marks in queries. Returns non-unique signature.
!w - Disable heteroatoms marks in queries. Returns non-unique signature.
t - Use aromatic bonds instead aromatic atoms.
!H - Disable hydrogens marks in queries. Returns non-unique signature.
!R - Disable rings marks in queries. Returns non-unique signature.
!t - Disable heteroatoms marks in queries. Returns non-unique signature.
A - Use aromatic bonds instead of aromatic atoms.
m - Set atom mapping.
r - Generate random-ordered smiles.
o - Old canonic ordering algorithm.
Keys can be combined in any order. Other keys are ignored.
"""
Expand All @@ -77,21 +78,26 @@ def __format__(self, format_spec):
kwargs['hybridization'] = False
if '!n' in format_spec:
kwargs['neighbors'] = False
if 't' in format_spec:
if 'A' in format_spec:
kwargs['aromatic'] = False
if 'm' in format_spec:
kwargs['mapping'] = True
if '!g' in format_spec:
if '!H' in format_spec:
kwargs['hydrogens'] = False
if '!c' in format_spec:
if '!R' in format_spec:
kwargs['rings'] = False
if '!w' in format_spec:
if '!t' in format_spec:
kwargs['heteroatoms'] = False
if 'r' in format_spec:
kwargs['random'] = True

def w(x):
return random()
else:
elif 'o' in format_spec:
kwargs['old_order'] = True
w = self.atoms_order.get
else:
w = self._smiles_order
return ''.join(self._smiles(w, **kwargs))
return str(self)

Expand Down Expand Up @@ -122,26 +128,37 @@ def _smiles(self, weights, *, asymmetric_closures=False, open_parenthesis='(', c

groups = defaultdict(int)
for n in atoms_set:
groups[weights(n)] += 1
groups[weights(n)] -= 1

if kwargs.get('random', False):
mod_weights_start = mod_weights = weights
elif kwargs.get('old_order', False):
def mod_weights_start(x):
lb = len(bonds[x])
if lb:
return (-groups[weights(x)], # rare groups
-lb, # more neighbors
lb / len({weights(x) for x in bonds[x]}), # more unique neighbors
weights(x)) # smallest weight
else:
return -groups[weights(x)], weights(x) # rare groups > smallest weight

def mod_weights_start(x):
# precedence of:
lb = len(bonds[x])
if lb:
return (groups[weights(x)], # rare groups
def mod_weights(x):
lb = len(bonds[x])
return (-groups[weights(x)], # rare groups
-lb, # more neighbors
lb / len({weights(x) for x in bonds[x]}), # more unique neighbors
weights(x), # smallest weight
seen[x]) # BFS nearest to starting
else:
def mod_weights_start(x):
return (groups[weights(x)], # common groups
weights(x)) # smallest weight
else:
return groups[weights(x)], weights(x) # rare groups > smallest weight

def mod_weights(x):
lb = len(bonds[x])
return (groups[weights(x)], # rare groups
-lb, # more neighbors
lb / len({weights(x) for x in bonds[x]}), # more unique neighbors
weights(x), # smallest weight
seen[x]) # BFS nearest to starting
def mod_weights(x):
return (groups[weights(x)], # common groups
weights(x), # smallest weight
seen[x]) # BFS nearest to starting

while True:
start = min(atoms_set, key=mod_weights_start)
Expand Down Expand Up @@ -173,8 +190,9 @@ def mod_weights(x):
front = bonds[child].keys() - {parent}
if front:
stack.append((child, depth_now - 1, iter(sorted(front, key=mod_weights))))
elif child not in disconnected:
disconnected.add(parent)
elif (child, parent) not in disconnected:
disconnected.add((parent, child))
disconnected.add((child, parent))
cycle = next(cycles)
tokens[parent].append((child, cycle))
tokens[child].append((parent, cycle))
Expand Down Expand Up @@ -260,6 +278,10 @@ def _format_closure(c):
class MoleculeSmiles(Smiles):
__slots__ = ()

@property
def _smiles_order(self):
return self._chiral_morgan.__getitem__

def _format_atom(self, n, adjacency, **kwargs):
atom = self._atoms[n]
charge = self._charges[n]
Expand Down Expand Up @@ -361,6 +383,10 @@ def _format_bond(self, n, m, adjacency, **kwargs):
class CGRSmiles(Smiles):
__slots__ = ()

@property
def _smiles_order(self):
return self.atoms_order.__getitem__

def _format_atom(self, n, **kwargs):
atom = self._atoms[n]
charge = self._charges[n]
Expand Down Expand Up @@ -401,6 +427,10 @@ def _format_bond(self, n, m, **kwargs):
class QuerySmiles(Smiles):
__slots__ = ()

@property
def _smiles_order(self):
return self.atoms_order.__getitem__

def _format_atom(self, n, **kwargs):
atom = self._atoms[n]
charge = self._charges[n]
Expand Down Expand Up @@ -454,6 +484,10 @@ def _format_bond(self, n, m, **kwargs):
class QueryCGRSmiles(Smiles):
__slots__ = ()

@property
def _smiles_order(self):
return self.atoms_order.__getitem__

def _format_atom(self, n, **kwargs):
atom = self._atoms[n]
charge = self._charges[n]
Expand Down
8 changes: 6 additions & 2 deletions CGRtools/algorithms/stereo.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright 2019, 2020 Ramil Nugmanov <nougmanoff@protonmail.com>
# Copyright 2019-2021 Ramil Nugmanov <nougmanoff@protonmail.com>
# This file is part of CGRtools.
#
# CGRtools is free software; you can redistribute it and/or modify
Expand Down Expand Up @@ -714,6 +714,10 @@ def _chiral_cis_trans(self) -> Set[Tuple[int, int]]:
def _chiral_allenes(self) -> Set[int]:
return self.__chiral_centers[2]

@property
def _chiral_morgan(self) -> Dict[int, int]:
return self.__chiral_centers[3]

@cached_property
def _stereo_axises(self: 'MoleculeContainer') -> Tuple[Tuple[Tuple[int, ...], ...], Tuple[Tuple[int, ...], ...]]:
"""
Expand Down Expand Up @@ -974,7 +978,7 @@ def __chiral_centers(self: 'MoleculeContainer'):
morgan = self._morgan({**morgan, **morgan_update})
morgan_update = {}
else:
return chiral_t, {(n, m) for n, *_, m in chiral_c}, {path[len(path) // 2] for path in chiral_a}
return chiral_t, {(n, m) for n, *_, m in chiral_c}, {path[len(path) // 2] for path in chiral_a}, morgan


class QueryStereo(Stereo): # todo: implement add_wedge
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def finalize_options(self):

setup(
name='CGRtools',
version='4.1.8',
version='4.1.9',
packages=['CGRtools', 'CGRtools.algorithms', 'CGRtools.algorithms.components', 'CGRtools.containers',
'CGRtools.files', 'CGRtools.files._mdl', 'CGRtools.periodictable', 'CGRtools.periodictable.element',
'CGRtools.utils', 'CGRtools.attributes'],
Expand Down

0 comments on commit 59948ab

Please sign in to comment.