This repository has been archived by the owner on Jul 11, 2023. It is now read-only.

Commit

Merge 1cb2d56 into 04323f0
roll committed May 27, 2020
2 parents 04323f0 + 1cb2d56 commit 2dd2812
Showing 6 changed files with 40 additions and 10 deletions.
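In short, this merge gives the AWS and remote loaders a `remote` flag and adds a `workbook_cache` option to the XLSX parser, so a remote workbook can be downloaded once and reused when reading several of its sheets.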
Binary file removed data/special/sheets.xlsx
1 change: 1 addition & 0 deletions tabulator/loaders/aws.py
@@ -22,6 +22,7 @@ class AWSLoader(Loader):

    # Public

    remote = True
    options = [
        's3_endpoint_url',
    ]
1 change: 1 addition & 0 deletions tabulator/loaders/remote.py
@@ -21,6 +21,7 @@ class RemoteLoader(Loader):

    # Public

    remote = True
    options = [
        'http_session',
        'http_stream',
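Both loaders now expose `remote = True` as a class attribute, which parsers can inspect to decide whether the incoming stream needs to be copied to local disk before parsing (the XLSX parser below checks it via `getattr(self.__loader, 'remote', False)`). A minimal sketch of that pattern, assuming a loader with tabulator's `load(source, mode, encoding)` interface; `open_binary` is an illustrative helper, not part of the library:

import shutil
from tempfile import TemporaryFile

def open_binary(loader, source):
    # Illustrative helper (not tabulator API): spool remote streams to disk
    # so the parser can seek and re-read them like a local file.
    data = loader.load(source, mode='b')
    if getattr(loader, 'remote', False):
        local = TemporaryFile()
        shutil.copyfileobj(data, local)
        data.close()
        local.seek(0)
        return local
    return data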
38 changes: 29 additions & 9 deletions tabulator/parsers/xlsx.py
@@ -4,12 +4,15 @@
from __future__ import absolute_import
from __future__ import unicode_literals

import os
import io
import six
import shutil
import atexit
import openpyxl
import datetime
from itertools import chain
from tempfile import TemporaryFile
from tempfile import NamedTemporaryFile
from ..parser import Parser
from .. import exceptions
from .. import helpers
@@ -25,16 +28,18 @@ class XLSXParser(Parser):

    options = [
        'sheet',
        'workbook_cache',
        'fill_merged_cells',
        'preserve_formatting',
        'adjust_floating_point_error',
    ]

    def __init__(self, loader, force_parse=False, sheet=1,
    def __init__(self, loader, force_parse=False, sheet=1, workbook_cache=None,
                 fill_merged_cells=False, preserve_formatting=False,
                 adjust_floating_point_error=False):
        self.__loader = loader
        self.__sheet_pointer = sheet
        self.__workbook_cache = workbook_cache
        self.__fill_merged_cells = fill_merged_cells
        self.__preserve_formatting = preserve_formatting
        self.__adjust_floating_point_error = adjust_floating_point_error
@@ -51,17 +56,32 @@ def closed(self):
    def open(self, source, encoding=None):
        self.close()
        self.__encoding = encoding
        self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Remote
        # Create copy for remote source
        # For remote stream we need local copy (will be deleted on close by Python)
        # https://docs.python.org/3.5/library/tempfile.html#tempfile.TemporaryFile
        if getattr(self.__bytes, 'remote', False):
            new_bytes = TemporaryFile()
            shutil.copyfileobj(self.__bytes, new_bytes)
            self.__bytes.close()
            self.__bytes = new_bytes
            self.__bytes.seek(0)
        if getattr(self.__loader, 'remote', False):
            # Cached
            if self.__workbook_cache is not None and source in self.__workbook_cache:
                self.__bytes = io.open(self.__workbook_cache[source], 'rb')
            # Not cached
            else:
                prefix = 'tabulator-'
                delete = self.__workbook_cache is None
                source_bytes = self.__loader.load(source, mode='b', encoding=encoding)
                target_bytes = NamedTemporaryFile(prefix=prefix, delete=delete)
                shutil.copyfileobj(source_bytes, target_bytes)
                source_bytes.close()
                target_bytes.seek(0)
                self.__bytes = target_bytes
                if self.__workbook_cache is not None:
                    self.__workbook_cache[source] = target_bytes.name
                    atexit.register(os.remove, target_bytes.name)

        # Local
        else:
            self.__bytes = self.__loader.load(source, mode='b', encoding=encoding)

        # Get book
        # To fill merged cells we can't use read-only because
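For remote sources the parser now spools the workbook into a `NamedTemporaryFile`. Without a `workbook_cache` the temporary file is deleted as soon as it is closed; with a cache dict the file is kept (`delete=False`), its path is stored under the source key, and removal is deferred to interpreter exit via `atexit`, so later opens of the same source reuse the local copy instead of downloading the workbook again. A condensed sketch of that flow, assuming the same loader interface (the `fetch_workbook` name and signature are illustrative, not part of tabulator):

import atexit
import io
import os
import shutil
from tempfile import NamedTemporaryFile

def fetch_workbook(loader, source, encoding=None, cache=None):
    # Reuse a previously downloaded copy if the cache already knows the source
    if cache is not None and source in cache:
        return io.open(cache[source], 'rb')
    source_bytes = loader.load(source, mode='b', encoding=encoding)
    # Keep the temp file around (delete=False) only when a cache is supplied
    target_bytes = NamedTemporaryFile(prefix='tabulator-', delete=cache is None)
    shutil.copyfileobj(source_bytes, target_bytes)
    source_bytes.close()
    target_bytes.seek(0)
    if cache is not None:
        cache[source] = target_bytes.name
        # Deferred cleanup: cached files are removed when the process exits
        atexit.register(os.remove, target_bytes.name)
    return target_bytes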
9 changes: 9 additions & 0 deletions tests/formats/test_xlsx.py
@@ -113,6 +113,15 @@ def test_stream_xlsx_preserve_formatting():
}]


def test_stream_xlsx_workbook_cache():
    workbook_cache = {}
    source = BASE_URL % 'data/special/sheets.xlsx'
    for sheet in ['Sheet1', 'Sheet2', 'Sheet3']:
        with Stream(source, sheet=sheet, workbook_cache=workbook_cache) as stream:
            assert len(workbook_cache) == 1
            assert stream.read()
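Because all three sheets live in the same remote workbook, the cache gains exactly one entry on the first open, and the later `Stream` calls reuse that local copy instead of downloading the file again.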


# Write

def test_stream_save_xlsx(tmpdir):
1 change: 0 additions & 1 deletion tests/test_stream.py
@@ -108,7 +108,6 @@ def test_stream_headers_inline_keyed_headers_is_none():
def test_stream_headers_xls_multiline():
    source = 'data/special/multiline-headers.xlsx'
    with Stream(source, headers=[1, 5], fill_merged_cells=True) as stream:
        print(stream.headers)
        assert stream.headers == [
            'Region',
            'Caloric contribution (%)',
