Skip to content

Commit

Permalink
Indexable sdf (#19)
Browse files Browse the repository at this point in the history
* SDFread indexable
* RDFread refactored
  • Loading branch information
salikhovi4 authored and Ramil Nugmanov committed Mar 25, 2019
1 parent c80d64a commit 7bb903e
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 111 deletions.
117 changes: 26 additions & 91 deletions CGRtools/files/RDFrw.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@
#
from bisect import bisect_left
from collections import defaultdict
from itertools import chain, islice
from itertools import chain
from logging import warning
from os.path import getsize
from subprocess import check_output
from sys import platform
from time import strftime
from traceback import format_exc
from ._CGRrw import WithMixin, CGRread, CGRwrite
from ._MDLrw import MOLwrite, MOLread, EMOLread, RXNread, ERXNread, prepare_meta
from ._MDLrw import MOLwrite, MOLread, MDLread, EMOLread, RXNread, ERXNread, prepare_meta
from ..containers.common import BaseContainer
from ..exceptions import InvalidFileType


class RDFread(CGRread, WithMixin):
class RDFread(CGRread, WithMixin, MDLread):
"""
MDL RDF files reader. works similar to opened file object. support `with` context manager.
on initialization accept opened in text mode file, string path to file,
Expand All @@ -41,116 +41,54 @@ class RDFread(CGRread, WithMixin):
def __init__(self, file, *args, indexable=False, **kwargs):
super().__init__(*args, **kwargs)
super(CGRread, self).__init__(file)
self.__data = self.__reader()
self._data = self.__reader()

if indexable and platform != 'win32' and not self._is_buffer:
self.__file = iter(self._file.readline, '')
if next(self.__data):
self.__shifts = [int(x.split(':', 1)[0]) for x in
check_output(['grep', '-boE', r'^\$[RM]FMT', self._file.name]).decode().split()]
self.__shifts.append(getsize(self._file.name))
if next(self._data):
self._shifts = [int(x.split(b':', 1)[0]) for x in
check_output(['grep', '-boE', r'^\$[RM]FMT', self._file.name]).split()]
self._shifts.append(getsize(self._file.name))
else:
self.__file = self._file
next(self.__data)

def read(self):
"""
parse whole file
:return: list of parsed molecules or reactions
"""
return list(iter(self)) # __len__ method call skip

def __iter__(self):
return (x for x in self.__data if x is not None)

def __len__(self):
if self.__shifts:
return len(self.__shifts) - 1
raise self.__implement_error

def __next__(self):
return next(iter(self))
next(self._data)

def seek(self, offset):
if self.__shifts:
if 0 <= offset < len(self.__shifts):
if self._shifts:
if 0 <= offset < len(self._shifts):
current_pos = self._file.tell()
new_pos = self.__shifts[offset]
new_pos = self._shifts[offset]
if current_pos != new_pos:
if current_pos == self.__shifts[-1]: # reached the end of the file
self.__data = self.__reader()
if current_pos == self._shifts[-1]: # reached the end of the file
self._data = self.__reader()
self.__file = iter(self._file.readline, '')
self._file.seek(0)
next(self.__data)
next(self._data)
if offset: # move not to the beginning of the file
self._file.seek(new_pos)
else:
if not self.__already_seeked:
if self.__shifts[0] < current_pos: # in the middle of the file
self.__data.send(True)
if self._shifts[0] < current_pos: # in the middle of the file
self._data.send(True)
self.__already_seeked = True
self._file.seek(new_pos)
else:
raise IndexError('invalid offset')
else:
raise self.__implement_error
raise self._implement_error

def tell(self):
if self.__shifts:
if self._shifts:
t = self._file.tell()
if t == self.__shifts[0]:
if t == self._shifts[0]:
return 0
elif t == self.__shifts[-1]:
return len(self.__shifts) - 1
elif t in self.__shifts:
return bisect_left(self.__shifts, t)
elif t == self._shifts[-1]:
return len(self._shifts) - 1
elif t in self._shifts:
return bisect_left(self._shifts, t)
else:
return bisect_left(self.__shifts, t) - 1
raise self.__implement_error

def __getitem__(self, item):
"""
getting the item by index from the original file,
if the required block of the file with an error,
then only the correct blocks are returned
:param item: int or slice
:return: ReactionContainer or list of ReactionContainers
"""
if self.__shifts:
_len = len(self.__shifts) - 1
_current_pos = self.tell()

if isinstance(item, int):
if item >= _len or item < -_len:
raise IndexError('List index out of range')
if item < 0:
item += _len
self.seek(item)
records = next(self.__data)
elif isinstance(item, slice):
start, stop, step = item.indices(_len)
if start == stop:
return []

if step == 1:
self.seek(start)
records = [x for x in islice(self.__data, 0, stop - start) if x is not None]
else:
records = []
for index in range(start, stop, step):
self.seek(index)
record = next(self.__data)
if record:
records.append(record)
else:
raise TypeError('Indices must be integers or slices')

self.seek(_current_pos)
if records is None:
raise self.__index_error
return records
raise self.__implement_error
return bisect_left(self._shifts, t) - 1
raise self._implement_error

def __reader(self):
record = parser = mkey = None
Expand Down Expand Up @@ -257,10 +195,7 @@ def __reader(self):
warning(f'record consist errors:\n{format_exc()}')
yield None

__shifts = None
__implement_error = NotImplementedError('Indexable supported in unix-like o.s. and for files stored on disk')
__already_seeked = False
__index_error = IndexError('Data block with requested index contain errors')


class RDFwrite(MOLwrite, WithMixin):
Expand Down
62 changes: 43 additions & 19 deletions CGRtools/files/SDFrw.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,44 +16,64 @@
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, see <https://www.gnu.org/licenses/>.
#
from bisect import bisect_left
from collections import defaultdict
from io import BytesIO
from logging import warning
from subprocess import check_output
from sys import platform
from traceback import format_exc
from ._CGRrw import WithMixin, CGRread, CGRwrite
from ._MDLrw import MOLwrite, MOLread, EMOLread, prepare_meta
from ._MDLrw import MOLwrite, MOLread, MDLread, EMOLread, prepare_meta


class SDFread(CGRread, WithMixin):
class SDFread(CGRread, WithMixin, MDLread):
"""
MDL SDF files reader. works similar to opened file object. support `with` context manager.
on initialization accept opened in text mode file, string path to file,
pathlib.Path object or another buffered reader object
"""
def __init__(self, file, *args, **kwargs):
def __init__(self, file, *args, indexable=False, **kwargs):
super().__init__(*args, **kwargs)
super(CGRread, self).__init__(file)
self.__data = self.__reader()

def read(self):
"""
parse whole file
:return: list of parsed molecules
"""
return list(self.__data)

def __iter__(self):
return self.__data

def __next__(self):
return next(self.__data)
self._data = self.__reader()

if indexable and platform != 'win32' and not self._is_buffer:
self.__file = iter(self._file.readline, '')
self._shifts = [0]
for x in BytesIO(check_output(['grep', '-bE', r'\$\$\$\$', self._file.name])):
_pos, _line = x.split(b':', 1)
self._shifts.append(int(_pos) + len(_line))
else:
self.__file = self._file

def seek(self, offset):
if self._shifts:
if 0 <= offset < len(self._shifts):
current_pos = self._file.tell()
new_pos = self._shifts[offset]
if current_pos != new_pos:
if current_pos == self._shifts[-1]: # reached the end of the file
self._data = self.__reader()
self.__file = iter(self._file.readline, '')
self._file.seek(new_pos)
else:
raise IndexError('invalid offset')
else:
raise self._implement_error

def tell(self):
if self._shifts:
t = self._file.tell()
return bisect_left(self._shifts, t)
raise self._implement_error

def __reader(self):
im = 3
failkey = False
mkey = parser = record = None
meta = defaultdict(list)
for line in self._file:
for line in self.__file:
if failkey and not line.startswith("$$$$"):
continue
elif parser:
Expand All @@ -65,6 +85,7 @@ def __reader(self):
failkey = True
parser = None
warning(f'line:\n{line}\nconsist errors:\n{format_exc()}')
yield None

elif line.startswith("$$$$"):
if record:
Expand All @@ -73,6 +94,7 @@ def __reader(self):
yield self._convert_structure(record)
except ValueError:
warning(f'record consist errors:\n{format_exc()}')
yield None
record = None

im = 3
Expand Down Expand Up @@ -101,13 +123,15 @@ def __reader(self):
except ValueError:
failkey = True
warning(f'line:\n{line}\nconsist errors:\n{format_exc()}')
yield None

if record: # True for MOL file only.
record['meta'] = prepare_meta(meta)
try:
yield self._convert_structure(record)
except ValueError:
warning(f'record consist errors:\n{format_exc()}')
yield None


class SDFwrite(MOLwrite, WithMixin):
Expand Down
70 changes: 69 additions & 1 deletion CGRtools/files/_MDLrw.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
from csv import reader
from logging import warning
from itertools import count, chain
from itertools import count, chain, islice
from ._CGRrw import CGRwrite, cgr_keys
from ..exceptions import EmptyMolecule
from ..periodictable import common_isotopes
Expand Down Expand Up @@ -434,6 +434,74 @@ def getvalue(self):
__in_mol = 0


class MDLread:
    """
    Mixin with the iteration and random-access logic shared by MDL file readers.

    The concrete reader is expected to provide:

    * ``_data`` - iterator over parsed records; ``None`` entries are treated as
      blocks that could not be parsed and are skipped or reported as errors.
    * ``seek(offset)`` / ``tell()`` - record-level positioning.
    * ``_shifts`` - set only by indexable readers; it holds one more entry than
      there are records (see ``__len__``). While it is falsy, every
      index-related operation raises ``NotImplementedError``.
    """

    def read(self):
        """
        parse whole file

        :return: list of parsed molecules
        """
        return list(iter(self))

    def __iter__(self):
        # records that failed to parse are represented by None and are skipped
        return (x for x in self._data if x is not None)

    def __next__(self):
        return next(iter(self))

    def __len__(self):
        """
        number of records in the indexed file

        :raises NotImplementedError: if the reader is not indexable
        """
        if self._shifts:
            return len(self._shifts) - 1
        raise self._implement_error

    def __getitem__(self, item):
        """
        getting the item by index from the original file,
        if the required block of the file with an error,
        then only the correct blocks are returned

        The reader position found before the call is restored afterwards.

        :param item: int or slice
        :return: [Molecule, Reaction]Container or list of [Molecule, Reaction]Containers
        :raises IndexError: if an integer index is out of range or points to a block with errors
        :raises TypeError: if item is neither int nor slice
        :raises NotImplementedError: if the reader is not indexable
        """
        if not self._shifts:
            raise self._implement_error

        _len = len(self._shifts) - 1
        _current_pos = self.tell()

        if isinstance(item, int):
            if item >= _len or item < -_len:
                raise IndexError('List index out of range')
            if item < 0:
                item += _len
            self.seek(item)
            records = next(self._data)
        elif isinstance(item, slice):
            start, stop, step = item.indices(_len)
            if start == stop:
                return []

            if step == 1:
                if stop < start:
                    # empty selection, e.g. reader[3:1]; islice rejects a negative stop,
                    # so mirror list semantics and return an empty list instead
                    return []
                self.seek(start)
                records = [x for x in islice(self._data, 0, stop - start) if x is not None]
            else:
                records = []
                for index in range(start, stop, step):
                    self.seek(index)
                    record = next(self._data)
                    # only None marks a broken block; valid but falsy records are kept,
                    # consistent with the contiguous-slice branch above
                    if record is not None:
                        records.append(record)
        else:
            raise TypeError('Indices must be integers or slices')

        self.seek(_current_pos)
        if records is None:
            raise self._index_error
        return records

    # overridden per instance by indexable readers
    _shifts = None
    _implement_error = NotImplementedError('Indexable supported in unix-like o.s. and for files stored on disk')
    _index_error = IndexError('Data block with requested index contain errors')


class MOLwrite(CGRwrite):
@staticmethod
def _format_mol(atoms, bonds, cgr):
Expand Down

0 comments on commit 7bb903e

Please sign in to comment.