h2oai · carrascomj · Oct 28, 2021 · Oct 28, 2021 · Oct 30, 2021 · Oct 30, 2021
diff --git a/ci/appveyor.yml b/ci/appveyor.yml
@@ -99,7 +99,7 @@ build_script:
 
     python -m pip install --upgrade pip
 
-    python -m pip install $DT_WHEEL pytest docutils pandas
+    python -m pip install $DT_WHEEL pytest docutils pandas openpyxl 
 
     python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
 
@@ -186,7 +186,7 @@ build_script:
 
       python -m pip install --upgrade pip
 
-      python -m pip install $DT_WHEEL pytest docutils pandas
+      python -m pip install $DT_WHEEL pytest docutils pandas openpyxl
 
       python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
 
@@ -212,7 +212,7 @@ build_script:
 
       python -m pip install --upgrade pip
 
-      python -m pip install $DT_WHEEL pytest docutils pandas
+      python -m pip install $DT_WHEEL pytest docutils pandas openpyxl
 
       python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
 
@@ -240,7 +240,7 @@ build_script:
 
     python -m pip install --upgrade pip
 
-    python -m pip install $DT_WHEEL pytest docutils pandas
+    python -m pip install $DT_WHEEL pytest docutils pandas openpyxl
 
     python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
 
@@ -266,6 +266,6 @@ build_script:
 
     python -m pip install --upgrade pip
 
-    python -m pip install $DT_WHEEL pytest docutils pandas
+    python -m pip install $DT_WHEEL pytest docutils pandas openpyxl
 
     python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
diff --git a/requirements_extra.txt b/requirements_extra.txt
@@ -1,4 +1,5 @@
 numpy
 pandas
 pyarrow
-xlrd<=1.2.0
+xlrd>=2.0.0
+openpyxl
diff --git a/src/datatable/utils/fread.py b/src/datatable/utils/fread.py
@@ -37,6 +37,7 @@
 from datatable.utils.misc import backticks_escape as escape
 from datatable.types import stype, ltype
 from datatable.xls import read_xls_workbook
+from datatable.xlsx import read_xlsx_workbook
 
 
 _url_regex = re.compile(r"(?:https?|ftp|file)://")
@@ -318,7 +319,13 @@ def _resolve_archive(filename, subpath, tempfiles):
             if logger:
                 logger.debug("Extracted: size = %d" % len(out_text))
 
-    elif ext == ".xlsx" or ext == ".xls":
+    elif ext == ".xlsx":
+        out_result = read_xlsx_workbook(filename, subpath)
+        if subpath:
+            filename = os.path.join(filename, subpath)
+
+
+    elif ext == ".xls":
         out_result = read_xls_workbook(filename, subpath)
         if subpath:
             filename = os.path.join(filename, subpath)

diff --git a/src/datatable/xls.py b/src/datatable/xls.py
@@ -23,11 +23,6 @@
 def read_xls_workbook(filename, subpath):
     try:
         import xlrd
-        # Fixes the warning
-        # "PendingDeprecationWarning: This method will be removed in future
-        #  versions.  Use 'tree.iter()' or 'list(tree.iter())' instead."
-        xlrd.xlsx.ensure_elementtree_imported(False, None)
-        xlrd.xlsx.Element_has_iter = True
     except ImportError:
         raise dt.exceptions.ImportError(
             "Module `xlrd` is required in order to read Excel file '%s'"

diff --git a/src/datatable/xlsx.py b/src/datatable/xlsx.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------------------------
+# Copyright 2018-2021 H2O.ai
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#-------------------------------------------------------------------------------
+import datatable as dt
+
+
+
+def read_xlsx_workbook(filename, subpath):
+    try:
+        import openpyxl
+    except ImportError:
+        raise dt.exceptions.ImportError(
+            "Module `openpyxl` is required in order to read Excel file '%s'"
+            % filename)
+
+    if subpath:
+        wb = openpyxl.load_workbook(filename, data_only=True)
+        range2d = None
+        if subpath in wb.sheetnames:
+            sheetname = subpath
+        else:
+            if "/" in subpath:
+                sheetname, xlsrange = subpath.rsplit('/', 1)
+                range2d = xlsrange
+            if not(sheetname in wb.sheetnames and range2d is not None):
+                raise ValueError("Sheet `%s` is not found in the XLS file"
+                                 % subpath)
+        ws = wb[sheetname]
+        result = read_xlsx_worksheet(ws, range2d)
+    else:
+        wb = openpyxl.load_workbook(filename, data_only=True)
+        result = {}
+        for name, ws in zip(wb.sheetnames, wb):
+            out = read_xlsx_worksheet(ws)
+            if out is None:
+                continue
+            for i, frame in out.items():
+                result["%s/%s/%s" % (filename, name, i)] = frame
+
+    if len(result) == 0:
+        return None
+    elif len(result) == 1:
+        for v in result.values():
+            return v
+    else:
+        return result
+
+
+
+def read_xlsx_worksheet(ws, subrange=None):
+    if subrange is None:
+        ranges2d = [ws.calculate_dimension()]
+    else:
+        ranges2d = [subrange]
+
+    results = {}
+    for range2d in ranges2d:
+        subview = ws[range2d]
+        # the subview is a tuple of rows, which are tuples of columns
+        ncols = len(subview[0])
+        colnames = [str(n.value) for n in subview[0]]
+
+        outdata = [
+            [
+                row[i]._value if row[i].data_type != "e" else None
+                for row in subview[1:]
+            ]
+            for i in range(ncols)
+        ]
+        coltypes = [
+            str if any(isinstance(cell, str) for cell in col) else None
+            for col in outdata
+        ]
+
+        # let the frame decide on the types of non-str cols
+        types = [str if coltype == str else None for coltype in coltypes]
+        frame = dt.Frame(outdata, names=colnames, types=types)
+        results[range2d] = frame
+    return results
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -198,3 +198,15 @@ def param(n):
         return [param(n) for n in core.get_tests_in_suite(suite)]
     else:
         return [pytest.param(lambda: pytest.skip("C++ tests not compiled"))]
+
+
+def frame_to_xlsx(dt, filename):
+    openpyxl = pytest.importorskip("openpyxl")
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    for j, name in enumerate(dt.names):
+        _ = ws.cell(row=1, column=j + 1, value=name)
+    for i in range(dt.nrows):
+        for j in range(dt.ncols):
+            _ = ws.cell(row=i + 2, column=j + 1,  value=dt[i, j])
+    wb.save(filename=filename)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -179,6 +179,15 @@ def tempfile_csv():
         os.unlink(fname)
 
 
+@pytest.fixture(scope="function")
+def tempfile_xlsx():
+    fd, fname = mod_tempfile.mkstemp(suffix=".xlsx")
+    os.close(fd)
+    yield fname
+    if os.path.exists(fname):
+        os.unlink(fname)
+
+
 @pytest.fixture(scope="function")
 def tempdir():
     dirname = mod_tempfile.mkdtemp()

diff --git a/tests/fread/test-xls.py b/tests/fread/test-xls.py
@@ -18,7 +18,7 @@
 import pytest
 import random
 import datatable as dt
-from tests import find_file, assert_equals
+from tests import find_file, assert_equals, frame_to_xlsx
 from datatable.xls import (
     _parse_row, _combine_ranges, _process_merged_cells,
     _range2d_to_excel_coords, _excel_coords_to_range2d)
@@ -93,6 +93,52 @@ def test__excel_coords_to_range2d(seed):
         assert _excel_coords_to_range2d(excel_coords) == range2d
 
 
+def test_xlsx_simple(tempfile_xlsx):
+    dt0 = dt.Frame({"A": [-1, 7, 10000, 12],
+                    "B": [True, None, False, None],
+                    "C": ["alpha", "beta", None, "delta"]})
+    frame_to_xlsx(dt0, tempfile_xlsx)
+    dt1 = dt.Frame(tempfile_xlsx)
+    assert dt1.source == tempfile_xlsx
+    assert_equals(dt0, dt1)
+
+
+def test_xlsx_subpath(tempfile_xlsx):
+    dt0 = dt.Frame({"A": [-1, 7, 10000, 12],
+                    "B": [True, None, False, None],
+                    "C": ["alpha", "beta", None, "delta"]})
+    frame_to_xlsx(dt0, tempfile_xlsx)
+    sub_worksheet = tempfile_xlsx + "/Sheet/A1:B3"
+    dt1 = dt.Frame(sub_worksheet)
+    assert dt1.source == sub_worksheet
+    assert_equals(dt0[:2, :2], dt1)
+
+
+def test_xlsx_all_types(tempfile_xlsx):
+    from datetime import datetime
+    dt0 = dt.Frame([[True, False, None, True, True],
+                   [None, 1, -9, 12, 3],
+                   [4, 1346, 999, None, None],
+                   [591, 0, None, -395734, 19384709],
+                   [None, 777, 1093487019384, -384, None],
+                   [2.987, 3.45e-24, -0.189134e+12, 45982.1, None],
+                   [39408.301, 9.459027045e-125, 4.4508e+222, None, 3.14159],
+                   [datetime(2021, 5, 6), datetime(2019, 9, 6),
+                    datetime(2019, 5, 6), None, datetime(2000, 12, 6)],
+                   ["Life", "Liberty", "and", "Pursuit of Happiness", None],
+                   ["кохайтеся", "чорнобриві", ",", "та", "не з москалями"]
+                   ],
+                  stypes=[dt.bool8, dt.int32, dt.int32, dt.int32, dt.int64,
+                          dt.float64, dt.float64, dt.Type.time64, dt.str32,
+                          dt.str32],
+                  names=["b8", "i32", "2i32", "3i32", "4i32", "f64", "2f64",
+                         "time64", "s32", "2s32"])
+    frame_to_xlsx(dt0, tempfile_xlsx)
+    dt1 = dt.Frame(tempfile_xlsx)
+    assert dt1.source == tempfile_xlsx
+    assert_equals(dt0, dt1)
+
+
 def test_10k_diabetes_xlsx():
     filename = find_file("h2o-3", "fread", "10k_diabetes.xlsx")
     DT = dt.fread(filename)