Fixes for universal newline support by genfromtxt (where supported)

dhomeier · dhomeier · commit 995ec932bee6 · 2014-06-27T00:33:12.000+01:00
Changed genfromtxt's file openers to text mode on Python 3.3+ so that
Python's automatic (universal) newline recognition is used; added
support for LZMA-compressed ('.xz') files.
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
@@ -53,8 +53,8 @@ class _FileOpeners(object):
     `_FileOpeners` contains a dictionary that holds one method for each
     supported file format. Attribute lookup is implemented in such a way that
     an instance of `_FileOpeners` itself can be indexed with the keys of that
-    dictionary. Currently uncompressed files as well as files
-    compressed with ``gzip`` or ``bz2`` compression are supported.
+    dictionary. Currently uncompressed files as well as files compressed with
+    ``gzip``, ``bz2`` or (from Python 3.3 on) ``xz`` compression are supported.
 
     Notes
     -----
@@ -64,7 +64,7 @@ class _FileOpeners(object):
     Examples
     --------
     >>> np.lib._datasource._file_openers.keys()
-    [None, '.bz2', '.gz']
+    [None, '.bz2', '.xz', '.gz']
     >>> np.lib._datasource._file_openers['.gz'] is gzip.open
     True
 
@@ -75,9 +75,17 @@ def __init__(self):
     def _load(self):
         if self._loaded:
             return
+        try:
+            import lzma
+            self._file_openers[".xz"] = lzma.open
+        except ImportError:
+            pass
         try:
             import bz2
-            self._file_openers[".bz2"] = bz2.BZ2File
+            if sys.version_info[:2] < (3,3):
+                self._file_openers[".bz2"] = bz2.BZ2File
+            else:
+                self._file_openers[".bz2"] = bz2.open
         except ImportError:
             pass
         try:
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
@@ -1191,7 +1191,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     ----------
     fname : file or str
         File, filename, or generator to read.  If the filename extension is
-        `.gz` or `.bz2`, the file is first decompressed. Note that
+        `.gz`, `.bz2` or `.xz`, the file is first decompressed. Note that
         generators must return byte strings in Python 3k.
     dtype : dtype, optional
         Data type of the resulting array.
@@ -1354,9 +1354,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
     try:
         if isinstance(fname, basestring):
             if sys.version_info[0] == 2:
-                fhd = iter(np.lib._datasource.open(fname, 'rbU'))
+                fhd = iter(np.lib._datasource.open(fname, 'rU'))
+            elif sys.version_info[1] < 3:
+                fhd = iter(np.lib._datasource.open(fname, 'r'))
             else:
-                fhd = iter(np.lib._datasource.open(fname, 'rb'))
+                fhd = iter(np.lib._datasource.open(fname, 'rt'))
             own_fhd = True
         else:
             fhd = iter(fname)
@@ -1391,7 +1393,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             if names is True:
                 if comments in first_line:
                     first_line = asbytes('').join(first_line.split(comments)[1:])
-            first_values = split_line(first_line)
+            first_values = split_line(asbytes(first_line))
     except StopIteration:
         # return an empty array if the datafile is empty
         first_line = asbytes('')
@@ -1611,7 +1613,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
 
     # Parse each line
     for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
-        values = split_line(line)
+        values = split_line(asbytes(line))
         nbvalues = len(values)
         # Skip an empty line
         if nbvalues == 0:
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
@@ -2,6 +2,9 @@
 
 import sys
 import gzip
+import bz2
+if sys.version_info[:2] > (3,2):
+    import lzma
 import os
 import threading
 import shutil
@@ -1608,8 +1611,54 @@ def test_recfromcsv(self):
     def test_gft_using_filename(self):
         # Test that we can load data from a filename as well as a file object
         wanted = np.arange(6).reshape((2, 3))
-        if sys.version_info[0] >= 3:
-            # python 3k is known to fail for '\r'
+        linesep = ('\n', '\r\n', '\r')
+
+        for sep in linesep:
+            data = '0 1 2' + sep + '3 4 5'
+            f, name = mkstemp()
+            # We can't use NamedTemporaryFile on windows, because we cannot
+            # reopen the file.
+            try:
+                os.write(f, asbytes(data))
+                assert_array_equal(np.genfromtxt(name), wanted)
+            finally:
+                os.close(f)
+                os.unlink(name)
+
+    def test_gft_from_gzip(self):
+        # Test that we can load data from a gzipped file
+        wanted = np.arange(6).reshape((2, 3))
+        if sys.version_info[:2] < (3,3):
+            # universal newline conversion fails in 2.7 (cause unknown);
+            # text mode (mode='rt') is not supported in 3.2
+            linesep = ('\n', '\r\n')
+        else:
+            linesep = ('\n', '\r\n', '\r')
+
+        for sep in linesep:
+            data = '0 1 2' + sep + '3 4 5'
+            s = BytesIO()
+            g = gzip.GzipFile(fileobj=s, mode='w')
+            g.write(asbytes(data))
+            g.close()
+            s.seek(0)
+
+            f, name = mkstemp(suffix='.gz')
+            # We can't use NamedTemporaryFile on windows, because we cannot
+            # reopen the file.
+            try:
+                os.write(f, s.read())
+                s.close()
+                assert_array_equal(np.genfromtxt(name), wanted)
+            finally:
+                os.close(f)
+                os.unlink(name)
+
+    def test_gft_from_bzip2(self):
+        # Test that we can load data from a bzip2 file
+        wanted = np.arange(6).reshape((2, 3))
+        if sys.version_info[0] == 3 and sys.version_info[1] < 3:
+            # universal newline conversion not supported (mode='rt') in 3.2
             linesep = ('\n', '\r\n')
         else:
             linesep = ('\n', '\r\n', '\r')
@@ -1621,6 +1670,34 @@ def test_gft_using_filename(self):
             # reopen the file.
             try:
                 os.write(f, asbytes(data))
+                os.system('bzip2 %s' % name)
+                assert_array_equal(np.genfromtxt(name+'.bz2'), wanted)
+            finally:
+                os.system('bzip2 -d %s.bz2' % name)
+                os.close(f)
+                os.unlink(name)
+
+    @np.testing.dec.knownfailureif(sys.version_info[:2] < (3,3),
+                                   "LZMA not supported in Python<3.3")
+    def test_gft_from_xz(self):
+        # Test that we can load data from an xz file
+        wanted = np.arange(6).reshape((2, 3))
+        linesep = ('\n', '\r\n', '\r')
+
+        for sep in linesep:
+            data = '0 1 2' + sep + '3 4 5'
+            s = BytesIO()
+            g = lzma.LZMAFile(s, mode='w')
+            g.write(asbytes(data))
+            g.close()
+            s.seek(0)
+
+            f, name = mkstemp(suffix='.xz')
+            # We can't use NamedTemporaryFile on windows, because we cannot
+            # reopen the file.
+            try:
+                os.write(f, s.read())
+                s.close()
                 assert_array_equal(np.genfromtxt(name), wanted)
             finally:
                 os.close(f)