Indexable sdf (#19)

* SDFread indexable * RDFread refactored
cimm-kzn · Mar 25, 2019 · 7bb903e · 7bb903e
1 parent c80d64a
commit 7bb903e
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 111 deletions.
diff --git a/CGRtools/files/RDFrw.py b/CGRtools/files/RDFrw.py
@@ -19,20 +19,20 @@
 #
 from bisect import bisect_left
 from collections import defaultdict
-from itertools import chain, islice
+from itertools import chain
 from logging import warning
 from os.path import getsize
 from subprocess import check_output
 from sys import platform
 from time import strftime
 from traceback import format_exc
 from ._CGRrw import WithMixin, CGRread, CGRwrite
-from ._MDLrw import MOLwrite, MOLread, EMOLread, RXNread, ERXNread, prepare_meta
+from ._MDLrw import MOLwrite, MOLread, MDLread, EMOLread, RXNread, ERXNread, prepare_meta
 from ..containers.common import BaseContainer
 from ..exceptions import InvalidFileType
 
 
-class RDFread(CGRread, WithMixin):
+class RDFread(CGRread, WithMixin, MDLread):
     """
     MDL RDF files reader. works similar to opened file object. support `with` context manager.
     on initialization accept opened in text mode file, string path to file,
@@ -41,116 +41,54 @@ class RDFread(CGRread, WithMixin):
     def __init__(self, file, *args, indexable=False, **kwargs):
         super().__init__(*args, **kwargs)
         super(CGRread, self).__init__(file)
-        self.__data = self.__reader()
+        self._data = self.__reader()
 
         if indexable and platform != 'win32' and not self._is_buffer:
             self.__file = iter(self._file.readline, '')
-            if next(self.__data):
-                self.__shifts = [int(x.split(':', 1)[0]) for x in
-                                 check_output(['grep', '-boE', r'^\$[RM]FMT', self._file.name]).decode().split()]
-                self.__shifts.append(getsize(self._file.name))
+            if next(self._data):
+                self._shifts = [int(x.split(b':', 1)[0]) for x in
+                                check_output(['grep', '-boE', r'^\$[RM]FMT', self._file.name]).split()]
+                self._shifts.append(getsize(self._file.name))
         else:
             self.__file = self._file
-            next(self.__data)
-
-    def read(self):
-        """
-        parse whole file
-
-        :return: list of parsed molecules or reactions
-        """
-        return list(iter(self))  # __len__ method call skip
-
-    def __iter__(self):
-        return (x for x in self.__data if x is not None)
-
-    def __len__(self):
-        if self.__shifts:
-            return len(self.__shifts) - 1
-        raise self.__implement_error
-
-    def __next__(self):
-        return next(iter(self))
+            next(self._data)
 
     def seek(self, offset):
-        if self.__shifts:
-            if 0 <= offset < len(self.__shifts):
+        if self._shifts:
+            if 0 <= offset < len(self._shifts):
                 current_pos = self._file.tell()
-                new_pos = self.__shifts[offset]
+                new_pos = self._shifts[offset]
                 if current_pos != new_pos:
-                    if current_pos == self.__shifts[-1]:  # reached the end of the file
-                        self.__data = self.__reader()
+                    if current_pos == self._shifts[-1]:  # reached the end of the file
+                        self._data = self.__reader()
                         self.__file = iter(self._file.readline, '')
                         self._file.seek(0)
-                        next(self.__data)
+                        next(self._data)
                         if offset:  # move not to the beginning of the file
                             self._file.seek(new_pos)
                     else:
                         if not self.__already_seeked:
-                            if self.__shifts[0] < current_pos:  # in the middle of the file
-                                self.__data.send(True)
+                            if self._shifts[0] < current_pos:  # in the middle of the file
+                                self._data.send(True)
                             self.__already_seeked = True
                         self._file.seek(new_pos)
             else:
                 raise IndexError('invalid offset')
         else:
-            raise self.__implement_error
+            raise self._implement_error
 
     def tell(self):
-        if self.__shifts:
+        if self._shifts:
             t = self._file.tell()
-            if t == self.__shifts[0]:
+            if t == self._shifts[0]:
                 return 0
-            elif t == self.__shifts[-1]:
-                return len(self.__shifts) - 1
-            elif t in self.__shifts:
-                return bisect_left(self.__shifts, t)
+            elif t == self._shifts[-1]:
+                return len(self._shifts) - 1
+            elif t in self._shifts:
+                return bisect_left(self._shifts, t)
             else:
-                return bisect_left(self.__shifts, t) - 1
-        raise self.__implement_error
-
-    def __getitem__(self, item):
-        """
-        getting the item by index from the original file,
-        if the required block of the file with an error,
-        then only the correct blocks are returned
-        :param item: int or slice
-        :return: ReactionContainer or list of ReactionContainers
-        """
-        if self.__shifts:
-            _len = len(self.__shifts) - 1
-            _current_pos = self.tell()
-
-            if isinstance(item, int):
-                if item >= _len or item < -_len:
-                    raise IndexError('List index out of range')
-                if item < 0:
-                    item += _len
-                self.seek(item)
-                records = next(self.__data)
-            elif isinstance(item, slice):
-                start, stop, step = item.indices(_len)
-                if start == stop:
-                    return []
-
-                if step == 1:
-                    self.seek(start)
-                    records = [x for x in islice(self.__data, 0, stop - start) if x is not None]
-                else:
-                    records = []
-                    for index in range(start, stop, step):
-                        self.seek(index)
-                        record = next(self.__data)
-                        if record:
-                            records.append(record)
-            else:
-                raise TypeError('Indices must be integers or slices')
-
-            self.seek(_current_pos)
-            if records is None:
-                raise self.__index_error
-            return records
-        raise self.__implement_error
+                return bisect_left(self._shifts, t) - 1
+        raise self._implement_error
 
     def __reader(self):
         record = parser = mkey = None
@@ -257,10 +195,7 @@ def __reader(self):
                 warning(f'record consist errors:\n{format_exc()}')
                 yield None
 
-    __shifts = None
-    __implement_error = NotImplementedError('Indexable supported in unix-like o.s. and for files stored on disk')
     __already_seeked = False
-    __index_error = IndexError('Data block with requested index contain errors')
 
 
 class RDFwrite(MOLwrite, WithMixin):

diff --git a/CGRtools/files/SDFrw.py b/CGRtools/files/SDFrw.py
@@ -16,44 +16,64 @@
 #  You should have received a copy of the GNU Lesser General Public License
 #  along with this program; if not, see <https://www.gnu.org/licenses/>.
 #
+from bisect import bisect_left
 from collections import defaultdict
+from io import BytesIO
 from logging import warning
+from subprocess import check_output
+from sys import platform
 from traceback import format_exc
 from ._CGRrw import WithMixin, CGRread, CGRwrite
-from ._MDLrw import MOLwrite, MOLread, EMOLread, prepare_meta
+from ._MDLrw import MOLwrite, MOLread, MDLread, EMOLread, prepare_meta
 
 
-class SDFread(CGRread, WithMixin):
+class SDFread(CGRread, WithMixin, MDLread):
     """
     MDL SDF files reader. works similar to opened file object. support `with` context manager.
     on initialization accept opened in text mode file, string path to file,
     pathlib.Path object or another buffered reader object
     """
-    def __init__(self, file, *args, **kwargs):
+    def __init__(self, file, *args, indexable=False, **kwargs):
         super().__init__(*args, **kwargs)
         super(CGRread, self).__init__(file)
-        self.__data = self.__reader()
-
-    def read(self):
-        """
-        parse whole file
-
-        :return: list of parsed molecules
-        """
-        return list(self.__data)
-
-    def __iter__(self):
-        return self.__data
-
-    def __next__(self):
-        return next(self.__data)
+        self._data = self.__reader()
+
+        if indexable and platform != 'win32' and not self._is_buffer:
+            self.__file = iter(self._file.readline, '')
+            self._shifts = [0]
+            for x in BytesIO(check_output(['grep', '-bE', r'\$\$\$\$', self._file.name])):
+                _pos, _line = x.split(b':', 1)
+                self._shifts.append(int(_pos) + len(_line))
+        else:
+            self.__file = self._file
+
+    def seek(self, offset):
+        if self._shifts:
+            if 0 <= offset < len(self._shifts):
+                current_pos = self._file.tell()
+                new_pos = self._shifts[offset]
+                if current_pos != new_pos:
+                    if current_pos == self._shifts[-1]:  # reached the end of the file
+                        self._data = self.__reader()
+                        self.__file = iter(self._file.readline, '')
+                    self._file.seek(new_pos)
+            else:
+                raise IndexError('invalid offset')
+        else:
+            raise self._implement_error
+
+    def tell(self):
+        if self._shifts:
+            t = self._file.tell()
+            return bisect_left(self._shifts, t)
+        raise self._implement_error
 
     def __reader(self):
         im = 3
         failkey = False
         mkey = parser = record = None
         meta = defaultdict(list)
-        for line in self._file:
+        for line in self.__file:
             if failkey and not line.startswith("$$$$"):
                 continue
             elif parser:
@@ -65,6 +85,7 @@ def __reader(self):
                     failkey = True
                     parser = None
                     warning(f'line:\n{line}\nconsist errors:\n{format_exc()}')
+                    yield None
 
             elif line.startswith("$$$$"):
                 if record:
@@ -73,6 +94,7 @@ def __reader(self):
                         yield self._convert_structure(record)
                     except ValueError:
                         warning(f'record consist errors:\n{format_exc()}')
+                        yield None
                     record = None
 
                 im = 3
@@ -101,13 +123,15 @@ def __reader(self):
                 except ValueError:
                     failkey = True
                     warning(f'line:\n{line}\nconsist errors:\n{format_exc()}')
+                    yield None
 
         if record:  # True for MOL file only.
             record['meta'] = prepare_meta(meta)
             try:
                 yield self._convert_structure(record)
             except ValueError:
                 warning(f'record consist errors:\n{format_exc()}')
+                yield None
 
 
 class SDFwrite(MOLwrite, WithMixin):

diff --git a/CGRtools/files/_MDLrw.py b/CGRtools/files/_MDLrw.py
@@ -18,7 +18,7 @@
 #
 from csv import reader
 from logging import warning
-from itertools import count, chain
+from itertools import count, chain, islice
 from ._CGRrw import CGRwrite, cgr_keys
 from ..exceptions import EmptyMolecule
 from ..periodictable import common_isotopes
@@ -434,6 +434,74 @@ def getvalue(self):
     __in_mol = 0
 
 
+class MDLread:
+    def read(self):
+        """
+        parse whole file
+
+        :return: list of parsed molecules
+        """
+        return list(iter(self))
+
+    def __iter__(self):
+        return (x for x in self._data if x is not None)
+
+    def __next__(self):
+        return next(iter(self))
+
+    def __len__(self):
+        if self._shifts:
+            return len(self._shifts) - 1
+        raise self._implement_error
+
+    def __getitem__(self, item):
+        """
+        getting the item by index from the original file,
+        if the required block of the file with an error,
+        then only the correct blocks are returned
+        :param item: int or slice
+        :return: [Molecule, Reaction]Container or list of [Molecule, Reaction]Containers
+        """
+        if self._shifts:
+            _len = len(self._shifts) - 1
+            _current_pos = self.tell()
+
+            if isinstance(item, int):
+                if item >= _len or item < -_len:
+                    raise IndexError('List index out of range')
+                if item < 0:
+                    item += _len
+                self.seek(item)
+                records = next(self._data)
+            elif isinstance(item, slice):
+                start, stop, step = item.indices(_len)
+                if start == stop:
+                    return []
+
+                if step == 1:
+                    self.seek(start)
+                    records = [x for x in islice(self._data, 0, stop - start) if x is not None]
+                else:
+                    records = []
+                    for index in range(start, stop, step):
+                        self.seek(index)
+                        record = next(self._data)
+                        if record:
+                            records.append(record)
+            else:
+                raise TypeError('Indices must be integers or slices')
+
+            self.seek(_current_pos)
+            if records is None:
+                raise self._index_error
+            return records
+        raise self._implement_error
+
+    _shifts = None
+    _implement_error = NotImplementedError('Indexable supported in unix-like o.s. and for files stored on disk')
+    _index_error = IndexError('Data block with requested index contain errors')
+
+
 class MOLwrite(CGRwrite):
     @staticmethod
     def _format_mol(atoms, bonds, cgr):