Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to xlrd 2.0.0 + openpyxl #3191

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
10 changes: 5 additions & 5 deletions ci/appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ build_script:

python -m pip install --upgrade pip

python -m pip install $DT_WHEEL pytest docutils pandas
python -m pip install $DT_WHEEL pytest docutils pandas openpyxl

python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/

Expand Down Expand Up @@ -186,7 +186,7 @@ build_script:

python -m pip install --upgrade pip

python -m pip install $DT_WHEEL pytest docutils pandas
python -m pip install $DT_WHEEL pytest docutils pandas openpyxl

python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/

Expand All @@ -212,7 +212,7 @@ build_script:

python -m pip install --upgrade pip

python -m pip install $DT_WHEEL pytest docutils pandas
python -m pip install $DT_WHEEL pytest docutils pandas openpyxl

python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/

Expand Down Expand Up @@ -240,7 +240,7 @@ build_script:

python -m pip install --upgrade pip

python -m pip install $DT_WHEEL pytest docutils pandas
python -m pip install $DT_WHEEL pytest docutils pandas openpyxl

python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/

Expand All @@ -266,6 +266,6 @@ build_script:

python -m pip install --upgrade pip

python -m pip install $DT_WHEEL pytest docutils pandas
python -m pip install $DT_WHEEL pytest docutils pandas openpyxl

python -m pytest -ra --maxfail=10 -Werror -vv --showlocals ./tests/
3 changes: 2 additions & 1 deletion requirements_extra.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
pandas
pyarrow
xlrd<=1.2.0
xlrd>=2.0.0
openpyxl
9 changes: 8 additions & 1 deletion src/datatable/utils/fread.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from datatable.utils.misc import backticks_escape as escape
from datatable.types import stype, ltype
from datatable.xls import read_xls_workbook
from datatable.xlsx import read_xlsx_workbook


_url_regex = re.compile(r"(?:https?|ftp|file)://")
Expand Down Expand Up @@ -318,7 +319,13 @@ def _resolve_archive(filename, subpath, tempfiles):
if logger:
logger.debug("Extracted: size = %d" % len(out_text))

elif ext == ".xlsx" or ext == ".xls":
elif ext == ".xlsx":
out_result = read_xlsx_workbook(filename, subpath)
if subpath:
filename = os.path.join(filename, subpath)


elif ext == ".xls":
out_result = read_xls_workbook(filename, subpath)
if subpath:
filename = os.path.join(filename, subpath)
Expand Down
5 changes: 0 additions & 5 deletions src/datatable/xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,6 @@
def read_xls_workbook(filename, subpath):
try:
import xlrd
# Fixes the warning
# "PendingDeprecationWarning: This method will be removed in future
# versions. Use 'tree.iter()' or 'list(tree.iter())' instead."
xlrd.xlsx.ensure_elementtree_imported(False, None)
xlrd.xlsx.Element_has_iter = True
except ImportError:
raise dt.exceptions.ImportError(
"Module `xlrd` is required in order to read Excel file '%s'"
Expand Down
92 changes: 92 additions & 0 deletions src/datatable/xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
#-------------------------------------------------------------------------------
# Copyright 2018-2021 H2O.ai
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-------------------------------------------------------------------------------
import datatable as dt



def read_xlsx_workbook(filename, subpath):
try:
import openpyxl
except ImportError:
raise dt.exceptions.ImportError(
"Module `openpyxl` is required in order to read Excel file '%s'"
% filename)

if subpath:
wb = openpyxl.load_workbook(filename, data_only=True)
range2d = None
if subpath in wb.sheetnames:
sheetname = subpath
else:
if "/" in subpath:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Appveyor is failing in Windows because of the conversion to a path with backslashes.

  1. C:\Path\to\file.xlsx\Sheet\A2:B9 fails (also for xls files with the current xlrd-based parser).
  2. C:\Path\to\file.xlsx\Sheet/A2:B9 also fails, since there is not such a C:\Path\to\file.xlsx\Sheet sheet, aftersplitting.
  3. C:\Path\to\file.xlsx/Sheet/A2:B9 works.

The intended behavior in Windows is to work with (1), right? I guess both xlsx.py and xls.py should check for backslashes if no (sheetname, range2d) pair was found when a subpath is specified.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Windows support was added after the excel reading feature, so, you're right, we probably missed the backslash issue. My feeling is that on Windows we should support both types of slashes. What do you think?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I guess it would be more consistent for the user if they do not have to change the code for different platforms. I'll give it a try in a day.

Copy link
Contributor

@oleksiyskononenko oleksiyskononenko Jan 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This issue has been fixed as of #3220

I'll give it a try in a day.

Do you still want to run some benchmarks for this PR?

sheetname, xlsrange = subpath.rsplit('/', 1)
range2d = xlsrange
if not(sheetname in wb.sheetnames and range2d is not None):
raise ValueError("Sheet `%s` is not found in the XLS file"
% subpath)
ws = wb[sheetname]
result = read_xlsx_worksheet(ws, range2d)
else:
wb = openpyxl.load_workbook(filename, data_only=True)
result = {}
for name, ws in zip(wb.sheetnames, wb):
out = read_xlsx_worksheet(ws)
if out is None:
continue
for i, frame in out.items():
result["%s/%s/%s" % (filename, name, i)] = frame

if len(result) == 0:
return None
elif len(result) == 1:
for v in result.values():
return v
else:
return result



def read_xlsx_worksheet(ws, subrange=None):
if subrange is None:
ranges2d = [ws.calculate_dimension()]
else:
ranges2d = [subrange]

results = {}
for range2d in ranges2d:
subview = ws[range2d]
# the subview is a tuple of rows, which are tuples of columns
ncols = len(subview[0])
colnames = [str(n.value) for n in subview[0]]

outdata = [
[
row[i]._value if row[i].data_type != "e" else None
for row in subview[1:]
]
for i in range(ncols)
]
coltypes = [
str if any(isinstance(cell, str) for cell in col) else None
for col in outdata
]

# let the frame decide on the types of non-str cols
types = [str if coltype == str else None for coltype in coltypes]
frame = dt.Frame(outdata, names=colnames, types=types)
results[range2d] = frame
return results
12 changes: 12 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,3 +198,15 @@ def param(n):
return [param(n) for n in core.get_tests_in_suite(suite)]
else:
return [pytest.param(lambda: pytest.skip("C++ tests not compiled"))]


def frame_to_xlsx(dt, filename):
openpyxl = pytest.importorskip("openpyxl")
wb = openpyxl.Workbook()
ws = wb.active
for j, name in enumerate(dt.names):
_ = ws.cell(row=1, column=j + 1, value=name)
for i in range(dt.nrows):
for j in range(dt.ncols):
_ = ws.cell(row=i + 2, column=j + 1, value=dt[i, j])
wb.save(filename=filename)
9 changes: 9 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,15 @@ def tempfile_csv():
os.unlink(fname)


@pytest.fixture(scope="function")
def tempfile_xlsx():
fd, fname = mod_tempfile.mkstemp(suffix=".xlsx")
os.close(fd)
yield fname
if os.path.exists(fname):
os.unlink(fname)


@pytest.fixture(scope="function")
def tempdir():
dirname = mod_tempfile.mkdtemp()
Expand Down
48 changes: 47 additions & 1 deletion tests/fread/test-xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import pytest
import random
import datatable as dt
from tests import find_file, assert_equals
from tests import find_file, assert_equals, frame_to_xlsx
from datatable.xls import (
_parse_row, _combine_ranges, _process_merged_cells,
_range2d_to_excel_coords, _excel_coords_to_range2d)
Expand Down Expand Up @@ -93,6 +93,52 @@ def test__excel_coords_to_range2d(seed):
assert _excel_coords_to_range2d(excel_coords) == range2d


def test_xlsx_simple(tempfile_xlsx):
dt0 = dt.Frame({"A": [-1, 7, 10000, 12],
"B": [True, None, False, None],
"C": ["alpha", "beta", None, "delta"]})
frame_to_xlsx(dt0, tempfile_xlsx)
dt1 = dt.Frame(tempfile_xlsx)
assert dt1.source == tempfile_xlsx
assert_equals(dt0, dt1)


def test_xlsx_subpath(tempfile_xlsx):
dt0 = dt.Frame({"A": [-1, 7, 10000, 12],
"B": [True, None, False, None],
"C": ["alpha", "beta", None, "delta"]})
frame_to_xlsx(dt0, tempfile_xlsx)
sub_worksheet = tempfile_xlsx + "/Sheet/A1:B3"
dt1 = dt.Frame(sub_worksheet)
assert dt1.source == sub_worksheet
assert_equals(dt0[:2, :2], dt1)


def test_xlsx_all_types(tempfile_xlsx):
from datetime import datetime
dt0 = dt.Frame([[True, False, None, True, True],
[None, 1, -9, 12, 3],
[4, 1346, 999, None, None],
[591, 0, None, -395734, 19384709],
[None, 777, 1093487019384, -384, None],
[2.987, 3.45e-24, -0.189134e+12, 45982.1, None],
[39408.301, 9.459027045e-125, 4.4508e+222, None, 3.14159],
[datetime(2021, 5, 6), datetime(2019, 9, 6),
datetime(2019, 5, 6), None, datetime(2000, 12, 6)],
["Life", "Liberty", "and", "Pursuit of Happiness", None],
["кохайтеся", "чорнобриві", ",", "та", "не з москалями"]
],
stypes=[dt.bool8, dt.int32, dt.int32, dt.int32, dt.int64,
dt.float64, dt.float64, dt.Type.time64, dt.str32,
dt.str32],
names=["b8", "i32", "2i32", "3i32", "4i32", "f64", "2f64",
"time64", "s32", "2s32"])
frame_to_xlsx(dt0, tempfile_xlsx)
dt1 = dt.Frame(tempfile_xlsx)
assert dt1.source == tempfile_xlsx
assert_equals(dt0, dt1)


def test_10k_diabetes_xlsx():
filename = find_file("h2o-3", "fread", "10k_diabetes.xlsx")
DT = dt.fread(filename)
Expand Down