Skip to content

Commit

Permalink
molecule package (#189)
Browse files Browse the repository at this point in the history
new molecule storage format. 10 times smaller than pickle
  • Loading branch information
stsouko committed Jul 16, 2021
1 parent 9124310 commit b1d691a
Show file tree
Hide file tree
Showing 2 changed files with 206 additions and 1 deletion.
205 changes: 205 additions & 0 deletions CGRtools/containers/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
from CachedMethods import cached_args_method, cached_property
from collections import defaultdict, Counter
from importlib.util import find_spec
from itertools import zip_longest
from math import ceil
from struct import pack_into, unpack_from
from typing import List, Union, Tuple, Optional, Dict, FrozenSet
from zlib import compress, decompress
from . import cgr, query # cyclic imports resolve
from .bonds import Bond, DynamicBond, QueryBond
from .common import Graph
Expand Down Expand Up @@ -592,6 +596,207 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.flush_cache()
del self._backup

def pack(self) -> bytes:
"""
Pack into compressed bytes.
Note:
* Less than 4096 atoms supported. Atoms mapping should be in range 1-4095.
* Implicit hydrogens count should be in range 0-7
* Isotope shift should be in range -15 - 15 relatively mdl.common_isotopes
* Atoms neighbors should be in range 0-15
Format specification:
12 bit - number of atoms
12 bit - cis/trans stereo block size
Atom block 9 bytes (repeated):
12 bit - atom number
4 bit - number of neighbors
2 bit tetrahedron sign (00 - not stereo, 10 or 11 - has stereo)
2 bit - allene sign
5 bit - isotope (00000 - not specified, over = isotope - common_isotope + 16)
7 bit - atomic number (<=118)
32 bit - XY float16 coordinates
3 bit - hydrogens (0-7)
4 bit - charge (charge + 4. possible range -4 - 4)
1 bit - radical state
Connection table: flatten list of neighbors. neighbors count stored in atom block.
For example CC(=O)O - {1: [2], 2: [1, 3, 4], 3: [2], 4: [2]} >> [2, 1, 3, 4, 2, 2].
Repeated block (equal to bonds count).
24 bit - paired 12 bit numbers.
Bonds order block (repeated):
16 bit - 5 bonds grouped (3 bit each). 1 bit unused. Zero padding used than bonds count not proportional to 5.
Cis/trans data block (repeated):
24 bit - atoms pair
7 bit - zero padding
1 bit - sign
"""
bonds = self._bonds
if max(bonds) > 4095:
raise ValueError('Big molecules not supported')
if any(len(x) > 15 for x in bonds.values()):
raise ValueError('To many neighbors not supported')
from ..files._mdl.mol import common_isotopes

plane = self._plane
charges = self._charges
radicals = self._radicals
hydrogens = self._hydrogens
atoms_stereo = self._atoms_stereo
allenes_stereo = self._allenes_stereo
cis_trans_stereo = self._cis_trans_stereo

data = bytearray(3 + # atoms count + cis/trans bit
9 * self.atoms_count + # atoms data
3 * self.bonds_count + # connection table
2 * ceil(self.bonds_count / 5) + # bonds order
4 * len(cis_trans_stereo))
pack_into('HB', data, 0, (self.atoms_count << 4) | (len(cis_trans_stereo) >> 8), len(cis_trans_stereo) & 0xff)
shift = 3

neighbors = []
bonds_pack = []
seen = set()
for o, (n, a) in enumerate(self._atoms.items()):
bs = bonds[n]
neighbors.extend(bs)
seen.add(n)
for m, b in bs.items():
if m not in seen:
bonds_pack.append(b.order - 1) # 8 - 4 bit, but 7 - 3 bit

# 3 bit - hydrogens (0-7) | 4 bit - charge | 1 bit - radical
hcr = (charges[n] + 4) << 1
if radicals[n]:
hcr |= 1
hcr |= hydrogens[n] << 5

# 2 bit tetrahedron sign | 2 bit - allene sign | 5 bit - isotope | 7 bit - atomic number (<=118)
sia = a.atomic_number
if a.isotope:
sia |= (a.isotope - common_isotopes[a.atomic_symbol] + 16) << 7

if n in atoms_stereo:
if atoms_stereo[n]:
sia |= 0xc000
else:
sia |= 0x8000
if n in allenes_stereo:
if allenes_stereo[n]:
sia |= 0x3000
else:
sia |= 0x2000

pack_into('2H2eB', data, shift + 9 * o,
(n << 4) | len(bs), # 12 bit - atom number | 4 bit - number of neighbors
sia, *plane[n], hcr)

shift += 9 * self.atoms_count
ngb = iter(neighbors)
for o, (n1, n2) in enumerate(zip_longest(ngb, ngb)):
# 12 bit + 12 bit
pack_into('I', data, shift + 3 * o, (n1 << 12) | n2)

# 16 bit - 5 bonds packing. 1 bit empty.
shift += 3 * self.bonds_count
bp = iter(bonds_pack)
for o, (b1, b2, b3, b4, b5) in enumerate(zip_longest(bp, bp, bp, bp, bp, fillvalue=0)):
pack_into('H', data, shift + 2 * o, (b1 << 12) | (b2 << 9) | (b3 << 6) | (b4 << 3) | b5)

shift += 2 * ceil(self.bonds_count / 5)
for o, ((n, m), s) in enumerate(cis_trans_stereo.items()):
pack_into('I', data, shift + 4 * o, (n << 20) | (m << 8) | s)

# 16 bit - neighbor | 3 bit bond type
return compress(bytes(data), 9)

@classmethod
def unpack(cls, data: bytes) -> 'MoleculeContainer':
"""
Unpack from compressed bytes.
"""
data = decompress(data)
from ..files._mdl.mol import common_isotopes
mol = cls()
atoms = mol._atoms
bonds = mol._bonds
plane = mol._plane
charges = mol._charges
radicals = mol._radicals
hydrogens = mol._hydrogens
atoms_stereo = mol._atoms_stereo
allenes_stereo = mol._allenes_stereo
cis_trans_stereo = mol._cis_trans_stereo

neighbors = {}
acs, cts = unpack_from('HB', data, 0)
cts |= (acs & 0x0f) << 8
shift = 3
for o in range(acs >> 4):
nn, sia, x, y, hcr = unpack_from('2H2eB', data, shift + 9 * o)
n = nn >> 4
neighbors[n] = nn & 0x0f
# stereo
s = sia >> 14
if s:
atoms_stereo[n] = s == 3
s = (sia >> 12) & 3
if s:
allenes_stereo[n] = s == 3

# atoms
a = Element.from_atomic_number(sia & 0x7f)
ai = (sia >> 7) & 0x1f
if ai:
ai += common_isotopes[a.__name__] - 16
else:
ai = None
atoms[n] = a = a(ai)
a._attach_to_graph(mol, n)

charges[n] = ((hcr >> 1) & 0x0f) - 4
radicals[n] = bool(hcr & 0x01)
hydrogens[n] = hcr >> 5
plane[n] = (x, y)

bc = sum(neighbors.values()) // 2
shift += 9 * len(neighbors)
connections = []
for o in range(bc):
nn = unpack_from('I', data, shift + 3 * o)[0]
connections.append((nn >> 12) & 0x0fff)
connections.append(nn & 0x0fff)

shift += 3 * bc
orders = []
for o in range(ceil(bc / 5)):
bb = unpack_from('H', data, shift + 2 * o)[0]
orders.append(((bb >> 12) & 0x07) + 1)
orders.append(((bb >> 9) & 0x07) + 1)
orders.append(((bb >> 6) & 0x07) + 1)
orders.append(((bb >> 3) & 0x07) + 1)
orders.append((bb & 0x07) + 1)
orders = orders[:bc] # skip padding

con = iter(connections)
ords = iter(orders)
for n, ms in neighbors.items():
bonds[n] = cbn = {}
for _ in range(ms):
m = next(con)
if m in bonds: # bond partially exists. need back-connection.
cbn[m] = bonds[m][n]
else:
cbn[m] = Bond(next(ords))

shift += 2 * ceil(bc / 5)
for o in range(cts): # cis/trans
ct = unpack_from('I', data, shift + 4 * o)[0]
cis_trans_stereo[(ct >> 20, (ct >> 8) & 0x0fff)] = ct & 0x01

for n in neighbors:
mol._calc_hybridization(n)
return mol

def __getstate__(self):
return {'conformers': self._conformers, 'hydrogens': self._hydrogens, 'atoms_stereo': self._atoms_stereo,
'allenes_stereo': self._allenes_stereo, 'cis_trans_stereo': self._cis_trans_stereo,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def finalize_options(self):

setup(
name='CGRtools',
version='4.2.15',
version='4.2.16',
packages=['CGRtools', 'CGRtools.algorithms', 'CGRtools.algorithms.calculate2d', 'CGRtools.algorithms.components',
'CGRtools.algorithms.standardize', 'CGRtools.containers', 'CGRtools.files', 'CGRtools.files._mdl',
'CGRtools.periodictable', 'CGRtools.periodictable.element', 'CGRtools.reactor', 'CGRtools.utils',
Expand Down

0 comments on commit b1d691a

Please sign in to comment.