From bb8825f4ec907b2ac4e5ea01794c38d658bf5363 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 3 Nov 2018 23:08:27 -0700 Subject: [PATCH] BUG: Delegate more of Excel parsing to CSV The idea is that we read the Excel file, get the data, and then let the TextParser handle the reading and parsing. We shouldn't be doing a lot of work that is already defined in parsers.py In doing so, we identified several bugs: * index_col=None was not being respected * usecols behavior was inconsistent with that of read_csv for list of strings and callable inputs * usecols was not being validated as proper Excel column names when passed as a string. Closes gh-18273. Closes gh-20480. --- doc/source/io.rst | 17 + doc/source/whatsnew/v0.24.0.txt | 3 + pandas/io/excel.py | 194 ++++--- pandas/tests/io/test_excel.py | 955 +++++++++++++++++--------------- 4 files changed, 659 insertions(+), 510 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 68faefa872c88..9a2f59df65386 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2867,6 +2867,23 @@ indices to be parsed. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. +If ``usecols`` is a list of strings, it is assumed that each string corresponds +to a column name provided either by the user in ``names`` or inferred from the +document header row(s). Those strings define which columns will be parsed: + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + +Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. + +If ``usecols`` is callable, the callable function will be evaluated against +the column names, returning names where the callable function evaluates to ``True``. + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + Parsing Dates +++++++++++++ diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 149d618c4a621..e58bc3b08a306 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -236,6 +236,7 @@ Other Enhancements - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`) +- :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) .. _whatsnew_0240.api_breaking: @@ -1289,6 +1290,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). - Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) - Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`) +- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) +- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 7a7b801f4ba4a..4261f111ca7fa 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -17,8 +17,7 @@ import pandas._libs.json as json import pandas.compat as compat from pandas.compat import ( - OrderedDict, add_metaclass, lrange, map, range, reduce, string_types, u, - zip) + OrderedDict, add_metaclass, lrange, map, range, string_types, u, zip) from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg @@ -93,13 +92,22 @@ .. deprecated:: 0.21.0 Pass in `usecols` instead. -usecols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of Excel column letters and - column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of +usecols : int, str, list-like, or callable default None + * If None, then parse all columns, + * If int, then indicates last column to be parsed + * If string, then indicates comma separated list of Excel column letters + and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. + * If list of ints, then indicates list of column numbers to be parsed. + * If list of strings, then indicates list of column names to be parsed. + + .. versionadded:: 0.24.0 + + * If callable, then evaluate each column name against it and parse the + column if the callable returns ``True``. + + .. versionadded:: 0.24.0 + squeeze : boolean, default False If the parsed data only contains one column then return a Series dtype : Type name or dict of column -> type, default None @@ -466,39 +474,6 @@ def parse(self, convert_float=convert_float, **kwds) - def _should_parse(self, i, usecols): - - def _range2cols(areas): - """ - Convert comma separated list of column names and column ranges to a - list of 0-based column indexes. - - >>> _range2cols('A:E') - [0, 1, 2, 3, 4] - >>> _range2cols('A,C,Z:AB') - [0, 2, 25, 26, 27] - """ - def _excel2num(x): - "Convert Excel column name like 'AB' to 0-based column index" - return reduce(lambda s, a: s * 26 + ord(a) - ord('A') + 1, - x.upper().strip(), 0) - 1 - - cols = [] - for rng in areas.split(','): - if ':' in rng: - rng = rng.split(':') - cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1) - else: - cols.append(_excel2num(rng)) - return cols - - if isinstance(usecols, int): - return i <= usecols - elif isinstance(usecols, compat.string_types): - return i in _range2cols(usecols) - else: - return i in usecols - def _parse_excel(self, sheet_name=0, header=0, @@ -527,10 +502,6 @@ def _parse_excel(self, raise NotImplementedError("chunksize keyword of read_excel " "is not implemented") - if parse_dates is True and index_col is None: - warnings.warn("The 'parse_dates=True' keyword of read_excel was " - "provided without an 'index_col' keyword value.") - import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, @@ -620,17 +591,13 @@ def _parse_cell(cell_contents, cell_typ): sheet = self.book.sheet_by_index(asheetname) data = [] - should_parse = {} + usecols = _maybe_convert_usecols(usecols) for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): - if usecols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, usecols) - - if usecols is None or should_parse[j]: - row.append(_parse_cell(value, typ)) + row.append(_parse_cell(value, typ)) data.append(row) if sheet.nrows == 0: @@ -642,24 +609,22 @@ def _parse_cell(cell_contents, cell_typ): # forward fill and pull out names for MultiIndex column header_names = None - if header is not None: - if is_list_like(header): - header_names = [] - control_row = [True] * len(data[0]) - for row in header: - if is_integer(skiprows): - row += skiprows - - data[row], control_row = _fill_mi_header( - data[row], control_row) - header_name, data[row] = _pop_header_name( - data[row], index_col) - header_names.append(header_name) - else: - data[header] = _trim_excel_header(data[header]) + if header is not None and is_list_like(header): + header_names = [] + control_row = [True] * len(data[0]) + + for row in header: + if is_integer(skiprows): + row += skiprows + + data[row], control_row = _fill_mi_header( + data[row], control_row) + header_name, _ = _pop_header_name( + data[row], index_col) + header_names.append(header_name) if is_list_like(index_col): - # forward fill values for MultiIndex index + # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: @@ -667,6 +632,7 @@ def _parse_cell(cell_contents, cell_typ): for col in index_col: last = data[offset][col] + for row in range(offset + 1, len(data)): if data[row][col] == '' or data[row][col] is None: data[row][col] = last @@ -693,11 +659,14 @@ def _parse_cell(cell_contents, cell_typ): thousands=thousands, comment=comment, skipfooter=skipfooter, + usecols=usecols, **kwds) output[asheetname] = parser.read(nrows=nrows) + if names is not None: output[asheetname].columns = names + if not squeeze or isinstance(output[asheetname], DataFrame): output[asheetname].columns = output[ asheetname].columns.set_names(header_names) @@ -726,6 +695,97 @@ def __exit__(self, exc_type, exc_value, traceback): self.close() +def _excel2num(x): + """ + Convert Excel column name like 'AB' to 0-based column index. + + Parameters + ---------- + x : str + The Excel column name to convert to a 0-based column index. + + Returns + ------- + num : int + The column index corresponding to the name. + + Raises + ------ + ValueError + Part of the Excel column name was invalid. + """ + index = 0 + + for c in x.upper().strip(): + cp = ord(c) + + if cp < ord("A") or cp > ord("Z"): + raise ValueError("Invalid column name: {x}".format(x=x)) + + index = index * 26 + cp - ord("A") + 1 + + return index - 1 + + +def _range2cols(areas): + """ + Convert comma separated list of column names and ranges to indices. + + Parameters + ---------- + areas : str + A string containing a sequence of column ranges (or areas). + + Returns + ------- + cols : list + A list of 0-based column indices. + + Examples + -------- + >>> _range2cols('A:E') + [0, 1, 2, 3, 4] + >>> _range2cols('A,C,Z:AB') + [0, 2, 25, 26, 27] + """ + cols = [] + + for rng in areas.split(","): + if ":" in rng: + rng = rng.split(":") + cols += lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1) + else: + cols.append(_excel2num(rng)) + + return cols + + +def _maybe_convert_usecols(usecols): + """ + Convert `usecols` into a compatible format for parsing in `parsers.py`. + + Parameters + ---------- + usecols : object + The use-columns object to potentially convert. + + Returns + ------- + converted : object + The compatible format of `usecols`. + """ + if usecols is None: + return usecols + + if isinstance(usecols, int): + return lrange(usecols + 1) + + if isinstance(usecols, compat.string_types): + return _range2cols(usecols) + + return usecols + + def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: if ( diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 1bd2fb5887e38..8a40b6e1e6e62 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -179,6 +179,65 @@ def test_usecols_str(self, ext): tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) + @pytest.mark.parametrize("usecols", [ + [0, 1, 3], [0, 3, 1], + [1, 0, 3], [1, 3, 0], + [3, 0, 1], [3, 1, 0], + ]) + def test_usecols_diff_positional_int_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["A", "C"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.parametrize("usecols", [ + ["B", "D"], ["D", "B"] + ]) + def test_usecols_diff_positional_str_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["B", "D"]] + expected.index = range(len(expected)) + + result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_read_excel_without_slicing(self, ext): + expected = self.get_csv_refdf("test1") + result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str(self, ext): + expected = self.get_csv_refdf("test1")[["C", "D"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols="A,D:E") + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str_invalid(self, ext): + msg = "Invalid column name: E1" + + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, ext): + msg = "list indices must be integers.*, not str" + + with tm.assert_raises_regex(TypeError, msg): + self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], + usecols=["A", "C"]) + + def test_usecols_pass_non_existent_column(self, ext): + msg = ("Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]") + + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf("test1", ext, usecols=["E"]) + + def test_usecols_wrong_type(self, ext): + msg = ("'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable.") + + with tm.assert_raises_regex(ValueError, msg): + self.get_exceldf("test1", ext, usecols=["E1", 0]) + def test_excel_stop_iterator(self, ext): parsed = self.get_exceldf('test2', ext, 'Sheet1') @@ -446,63 +505,48 @@ def test_read_excel_blank_with_header(self, ext): actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') tm.assert_frame_equal(actual, expected) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - # GH 12292 : error when read one empty column from excel file - def test_read_one_empty_col_no_header(self, ext): + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) + with ensure_clean(ext) as path: - df.to_excel(path, 'no_header', index=False, header=False) - actual_header_none = read_excel( - path, - 'no_header', - usecols=[0], - header=None - ) - - actual_header_zero = read_excel( - path, - 'no_header', - usecols=[0], - header=0 - ) - expected = DataFrame() - tm.assert_frame_equal(actual_header_none, expected) - tm.assert_frame_equal(actual_header_zero, expected) + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_read_one_empty_col_with_header(self, ext): + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" df = pd.DataFrame( [["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]] ) + with ensure_clean(ext) as path: df.to_excel(path, 'with_header', index=False, header=True) - actual_header_none = read_excel( - path, - 'with_header', - usecols=[0], - header=None - ) - - actual_header_zero = read_excel( - path, - 'with_header', - usecols=[0], - header=0 - ) - expected_header_none = DataFrame(pd.Series([0], dtype='int64')) - tm.assert_frame_equal(actual_header_none, expected_header_none) - expected_header_zero = DataFrame(columns=[0]) - tm.assert_frame_equal(actual_header_zero, expected_header_zero) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) @td.skip_if_no('openpyxl') @td.skip_if_no('xlwt') @@ -539,29 +583,33 @@ def test_date_conversion_overflow(self, ext): result = self.get_exceldf('testdateoverflow', ext) tm.assert_frame_equal(result, expected) - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_sheet_name_and_sheetname(self, ext): - # GH10559: Minor improvement: Change "sheet_name" to "sheetname" - # GH10969: DOC: Consistent var names (sheetname vs sheet_name) - # GH12604: CLN GH10559 Rename sheetname variable to sheet_name - # GH20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument - dfref = self.get_csv_refdf('test1') - df1 = self.get_exceldf('test1', ext, - sheet_name='Sheet1') # doc + # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" + # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) + # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name + # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different + # behavior for "sheetname" argument + filename = "test1" + sheet_name = "Sheet1" + + df_ref = self.get_csv_refdf(filename) + df1 = self.get_exceldf(filename, ext, + sheet_name=sheet_name, index_col=0) # doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2 = self.get_exceldf('test1', ext, - sheetname='Sheet1') # bkwrd compat + df2 = self.get_exceldf(filename, ext, index_col=0, + sheetname=sheet_name) # backward compat - excel = self.get_excelfile('test1', ext) - df1_parse = excel.parse(sheet_name='Sheet1') # doc + excel = self.get_excelfile(filename, ext) + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(sheetname='Sheet1') # bkwrd compat + df2_parse = excel.parse(index_col=0, + sheetname=sheet_name) # backward compat - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df1_parse, dfref, check_names=False) - tm.assert_frame_equal(df2_parse, dfref, check_names=False) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_sheet_name_both_raises(self, ext): with tm.assert_raises_regex(TypeError, "Cannot specify both"): @@ -594,20 +642,24 @@ def test_excel_read_buffer(self, ext): actual = read_excel(xls, 'Sheet1', index_col=0) tm.assert_frame_equal(expected, actual) - @td.skip_if_no('xlwt') - def test_read_xlrd_Book(self, ext): + @td.skip_if_no("xlwt") + def test_read_xlrd_book(self, ext): import xlrd - df = self.frame - with ensure_clean('.xls') as pth: - df.to_excel(pth, "SheetA") + + engine = "xlrd" + sheet_name = "SheetA" + + with ensure_clean(ext) as pth: + df.to_excel(pth, sheet_name) book = xlrd.open_workbook(pth) - with ExcelFile(book, engine="xlrd") as xl: - result = read_excel(xl, "SheetA") + with ExcelFile(book, engine=engine) as xl: + result = read_excel(xl, sheet_name, index_col=0) tm.assert_frame_equal(df, result) - result = read_excel(book, sheet_name="SheetA", engine="xlrd") + result = read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) tm.assert_frame_equal(df, result) @tm.network @@ -618,17 +670,18 @@ def test_read_from_http_url(self, ext): local_table = self.get_exceldf('test1', ext) tm.assert_frame_equal(url_table, local_table) - @td.skip_if_no('s3fs') + @td.skip_if_no("s3fs") @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext): - boto3 = pytest.importorskip('boto3') - moto = pytest.importorskip('moto') + moto = pytest.importorskip("moto") + boto3 = pytest.importorskip("boto3") with moto.mock_s3(): conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket="pandas-test") file_name = os.path.join(self.dirpath, 'test1' + ext) - with open(file_name, 'rb') as f: + + with open(file_name, "rb") as f: conn.Bucket("pandas-test").put_object(Key="test1" + ext, Body=f) @@ -695,17 +748,18 @@ def test_reader_closes_file(self, ext): assert f.closed - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") def test_creating_and_reading_multiple_sheets(self, ext): - # Test reading multiple sheets, from a runtime created excel file - # with multiple sheets. - # See PR #9450 - def tdf(sheetname): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[sheetname]) + return DataFrame(d, i, columns=[col_sheet_name]) - sheets = ['AAA', 'BBB', 'CCC'] + sheets = ["AAA", "BBB", "CCC"] dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) @@ -714,7 +768,9 @@ def tdf(sheetname): with ExcelWriter(pth) as ew: for sheetname, df in iteritems(dfs): df.to_excel(ew, sheetname) - dfs_returned = read_excel(pth, sheet_name=sheets) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) @@ -756,206 +812,206 @@ def test_reader_seconds(self, ext): tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, ext): - # GH 4679 - mi = MultiIndex.from_product([['foo', 'bar'], ['a', 'b']]) - mi_file = os.path.join(self.dirpath, 'testmultiindex' + ext) - - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], + # see gh-4679 + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + + # "mi_column" sheet + expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True]], columns=mi) - actual = read_excel(mi_file, 'mi_column', header=[0, 1]) - tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'mi_column', header=[0, 1], index_col=0) + actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - expected.columns = ['a', 'b', 'c', 'd'] + # "mi_index" sheet expected.index = mi - actual = read_excel(mi_file, 'mi_index', index_col=[0, 1]) + expected.columns = ["a", "b", "c", "d"] + + actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) + # "both" sheet expected.columns = mi - actual = read_excel(mi_file, 'both', index_col=[0, 1], header=[0, 1]) + + actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - expected.index = mi.set_names(['ilvl1', 'ilvl2']) - expected.columns = ['a', 'b', 'c', 'd'] - actual = read_excel(mi_file, 'mi_index_name', index_col=[0, 1]) + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) + # "mi_column_name" sheet expected.index = list(range(4)) - expected.columns = mi.set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'mi_column_name', + expected.columns = mi.set_names(["c1", "c2"]) + actual = read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - # Issue #11317 + # see gh-11317 + # "name_with_int" sheet expected.columns = mi.set_levels( - [1, 2], level=1).set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'name_with_int', + [1, 2], level=1).set_names(["c1", "c2"]) + + actual = read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) - expected.columns = mi.set_names(['c1', 'c2']) - expected.index = mi.set_names(['ilvl1', 'ilvl2']) - actual = read_excel(mi_file, 'both_name', - index_col=[0, 1], header=[0, 1]) - tm.assert_frame_equal(actual, expected) + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = read_excel(mi_file, 'both_name', + actual = read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0, 1], + # "both_skiprows" sheet + actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) - @td.skip_if_no('xlsxwriter') + @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): - # GH 12453 - with ensure_clean('.xlsx') as path: + # see gh-12453 + with ensure_clean(ext) as path: df = DataFrame({ - ('One', 'x'): {0: 1}, - ('Two', 'X'): {0: 3}, - ('Two', 'Y'): {0: 7}, - ('Zero', ''): {0: 0} + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} }) expected = DataFrame({ - ('One', u'x'): {0: 1}, - ('Two', u'X'): {0: 3}, - ('Two', u'Y'): {0: 7}, - ('Zero', 'Unnamed: 3_level_1'): {0: 0} + ("One", u"x"): {0: 1}, + ("Two", u"X"): {0: 3}, + ("Two", u"Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} }) df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1]) + actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) df = pd.DataFrame({ - ('Beg', ''): {0: 0}, - ('Middle', 'x'): {0: 1}, - ('Tail', 'X'): {0: 3}, - ('Tail', 'Y'): {0: 7} + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} }) expected = pd.DataFrame({ - ('Beg', 'Unnamed: 0_level_1'): {0: 0}, - ('Middle', u'x'): {0: 1}, - ('Tail', u'X'): {0: 3}, - ('Tail', u'Y'): {0: 7} + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", u"x"): {0: 1}, + ("Tail", u"X"): {0: 3}, + ("Tail", u"Y"): {0: 7} }) df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1]) + actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - @td.skip_if_no('xlsxwriter') - def test_excel_multindex_roundtrip(self, ext): - # GH 4679 - with ensure_clean('.xlsx') as pth: - for c_idx_names in [True, False]: - for r_idx_names in [True, False]: - for c_idx_levels in [1, 3]: - for r_idx_levels in [1, 3]: - # column index name can't be serialized unless - # MultiIndex - if (c_idx_levels == 1 and c_idx_names): - continue - - # empty name case current read in as unnamed - # levels, not Nones - check_names = True - if not r_idx_names and r_idx_levels > 1: - check_names = False - - df = mkdf(5, 5, c_idx_names, - r_idx_names, c_idx_levels, - r_idx_levels) - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) - df.iloc[0, :] = np.nan - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel( - pth, index_col=list(range(r_idx_levels)), + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels))) - tm.assert_frame_equal( - df, act, check_names=check_names) + tm.assert_frame_equal(df, act, check_names=check_names) def test_excel_old_index_format(self, ext): # see gh-4679 - filename = 'test_index_name_pre17' + ext + filename = "test_index_name_pre17" + ext in_file = os.path.join(self.dirpath, filename) # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. data = np.array([[None, None, None, None, None], - ['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], - ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], - ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], - ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], - ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) - columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] - mi = MultiIndex(levels=[['R0', 'R_l0_g0', 'R_l0_g1', - 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], - ['R1', 'R_l1_g0', 'R_l1_g1', - 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", + "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", + "R_l1_g2", "R_l1_g3", "R_l1_g4"]], labels=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], names=[None, None]) - si = Index(['R0', 'R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name=None) + si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, 'single_names') + actual = pd.read_excel(in_file, "single_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, 'multi_names') + actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # The analogous versions of the "names" version data # where there are explicitly no names for the indices. - data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], - ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], - ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], - ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], - ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) - columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] - mi = MultiIndex(levels=[['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], - ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', - 'R_l1_g3', 'R_l1_g4']], + data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", + "R_l1_g3", "R_l1_g4"]], labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=[None, None]) - si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name=None) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - actual = pd.read_excel(in_file, 'single_no_names') + actual = pd.read_excel(in_file, "single_no_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) + actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) def test_read_excel_bool_header_arg(self, ext): @@ -971,33 +1027,28 @@ def test_read_excel_chunksize(self, ext): pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), chunksize=100) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") def test_read_excel_parse_dates(self, ext): - # GH 11544, 12051 + # see gh-11544, gh-12051 df = DataFrame( - {'col': [1, 2, 3], - 'date_strings': pd.date_range('2012-01-01', periods=3)}) + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) df2 = df.copy() - df2['date_strings'] = df2['date_strings'].dt.strftime('%m/%d/%Y') + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") with ensure_clean(ext) as pth: df2.to_excel(pth) - res = read_excel(pth) + res = read_excel(pth, index_col=0) tm.assert_frame_equal(df2, res) - # no index_col specified when parse_dates is True - with tm.assert_produces_warning(): - res = read_excel(pth, parse_dates=True) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=['date_strings'], index_col=0) + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - dateparser = lambda x: pd.datetime.strptime(x, '%m/%d/%Y') - res = read_excel(pth, parse_dates=['date_strings'], - date_parser=dateparser, index_col=0) + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) tm.assert_frame_equal(df, res) def test_read_excel_skiprows_list(self, ext): @@ -1106,26 +1157,29 @@ class and any subclasses, on account of the `autouse=True` class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. - def test_excel_sheet_by_name_raise(self, merge_cells, engine, ext): + def test_excel_sheet_by_name_raise(self, *_): import xlrd gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(self.path) + xl = ExcelFile(self.path) - df = read_excel(xl, 0) + df = read_excel(xl, 0, index_col=0) + tm.assert_frame_equal(gt, df) with pytest.raises(xlrd.XLRDError): - read_excel(xl, '0') + read_excel(xl, "0") - def test_excelwriter_contextmanager(self, merge_cells, engine, ext): + def test_excel_writer_context_manager(self, *_): with ExcelWriter(self.path) as writer: - self.frame.to_excel(writer, 'Data1') - self.frame2.to_excel(writer, 'Data2') + self.frame.to_excel(writer, "Data1") + self.frame2.to_excel(writer, "Data2") with ExcelFile(self.path) as reader: - found_df = read_excel(reader, 'Data1') - found_df2 = read_excel(reader, 'Data2') + found_df = read_excel(reader, "Data1", index_col=0) + found_df2 = read_excel(reader, "Data2", index_col=0) + tm.assert_frame_equal(found_df, self.frame) tm.assert_frame_equal(found_df2, self.frame2) @@ -1182,12 +1236,13 @@ def test_mixed(self, merge_cells, engine, ext): recons = read_excel(reader, 'test1', index_col=0) tm.assert_frame_equal(self.mixed_frame, recons) - def test_tsframe(self, merge_cells, engine, ext): + def test_ts_frame(self, *_): df = tm.makeTimeDataFrame()[:5] - df.to_excel(self.path, 'test1') + df.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) def test_basics_with_nan(self, merge_cells, engine, ext): @@ -1200,21 +1255,25 @@ def test_basics_with_nan(self, merge_cells, engine, ext): @pytest.mark.parametrize("np_type", [ np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, merge_cells, engine, ext, np_type): - # Test np.int values read come back as int (rather than float - # which is Excel's format). + # Test np.int values read come back as int + # (rather than float which is Excel's format). frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + int_frame = frame.astype(np.int64) tm.assert_frame_equal(int_frame, recons) - recons2 = read_excel(self.path, 'test1') + + recons2 = read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) - # test with convert_float=False comes back as float + # Test with convert_float=False comes back as float. float_frame = frame.astype(float) - recons = read_excel(self.path, 'test1', convert_float=False) + recons = read_excel(self.path, "test1", + convert_float=False, index_col=0) tm.assert_frame_equal(recons, float_frame, check_index_type=False, check_column_type=False) @@ -1224,25 +1283,31 @@ def test_int_types(self, merge_cells, engine, ext, np_type): def test_float_types(self, merge_cells, engine, ext, np_type): # Test np.float values read come back as float. frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1').astype(np_type) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + tm.assert_frame_equal(frame, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, merge_cells, engine, ext, np_type): # Test np.bool values read come back as float. frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1').astype(np_type) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + tm.assert_frame_equal(frame, recons) - def test_inf_roundtrip(self, merge_cells, engine, ext): + def test_inf_roundtrip(self, *_): frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - frame.to_excel(self.path, 'test1') + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(frame, recons) def test_sheets(self, merge_cells, engine, ext): @@ -1353,37 +1418,41 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): tm.assert_frame_equal(result, df) assert result.index.name == 'foo' - def test_excel_roundtrip_datetime(self, merge_cells, engine, ext): + def test_excel_roundtrip_datetime(self, merge_cells, *_): # datetime.date, not sure what to test here exactly tsf = self.tsframe.copy() tsf.index = [x.date() for x in self.tsframe.index] - tsf.to_excel(self.path, 'test1', merge_cells=merge_cells) + tsf.to_excel(self.path, "test1", merge_cells=merge_cells) + reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(self.tsframe, recons) - # GH4133 - excel output format strings def test_excel_date_datetime_format(self, merge_cells, engine, ext): + # see gh-4133 + # + # Excel output format strings df = DataFrame([[date(2014, 1, 31), date(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)]], - index=['DATE', 'DATETIME'], columns=['X', 'Y']) + index=["DATE", "DATETIME"], columns=["X", "Y"]) df_expected = DataFrame([[datetime(2014, 1, 31), datetime(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)]], - index=['DATE', 'DATETIME'], columns=['X', 'Y']) + index=["DATE", "DATETIME"], columns=["X", "Y"]) with ensure_clean(ext) as filename2: writer1 = ExcelWriter(self.path) writer2 = ExcelWriter(filename2, - date_format='DD.MM.YYYY', - datetime_format='DD.MM.YYYY HH-MM-SS') + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS") - df.to_excel(writer1, 'test1') - df.to_excel(writer2, 'test1') + df.to_excel(writer1, "test1") + df.to_excel(writer2, "test1") writer1.close() writer2.close() @@ -1391,54 +1460,66 @@ def test_excel_date_datetime_format(self, merge_cells, engine, ext): reader1 = ExcelFile(self.path) reader2 = ExcelFile(filename2) - rs1 = read_excel(reader1, 'test1', index_col=None) - rs2 = read_excel(reader2, 'test1', index_col=None) + rs1 = read_excel(reader1, "test1", index_col=0) + rs2 = read_excel(reader2, "test1", index_col=0) tm.assert_frame_equal(rs1, rs2) - # since the reader returns a datetime object for dates, we need - # to use df_expected to check the result + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, merge_cells, engine, ext): - # GH19242 - test writing Interval without labels + def test_to_excel_interval_no_labels(self, *_): + # see gh-19242 + # + # Test writing Interval without labels. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = frame.copy() - frame['new'] = pd.cut(frame[0], 10) - expected['new'] = pd.cut(expected[0], 10).astype(str) - frame.to_excel(self.path, 'test1') + + frame["new"] = pd.cut(frame[0], 10) + expected["new"] = pd.cut(expected[0], 10).astype(str) + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_interval_labels(self, merge_cells, engine, ext): - # GH19242 - test writing Interval with labels + def test_to_excel_interval_labels(self, *_): + # see gh-19242 + # + # Test writing Interval with labels. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=['A', 'B', 'C', 'D', 'E', - 'F', 'G', 'H', 'I', 'J']) - frame['new'] = intervals - expected['new'] = pd.Series(list(intervals)) - frame.to_excel(self.path, 'test1') + intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", + "F", "G", "H", "I", "J"]) + frame["new"] = intervals + expected["new"] = pd.Series(list(intervals)) + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_timedelta(self, merge_cells, engine, ext): - # GH 19242, GH9155 - test writing timedelta to xls + def test_to_excel_timedelta(self, *_): + # see gh-19242, gh-9155 + # + # Test writing timedelta to xls. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=['A'], - dtype=np.int64 - ) + columns=["A"], dtype=np.int64) expected = frame.copy() - frame['new'] = frame['A'].apply(lambda x: timedelta(seconds=x)) - expected['new'] = expected['A'].apply( + + frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) + expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - frame.to_excel(self.path, 'test1') + + frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1') + + recons = read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, merge_cells, engine, ext): @@ -1543,53 +1624,54 @@ def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) - def test_to_excel_float_format(self, merge_cells, engine, ext): + def test_to_excel_float_format(self, *_): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - df.to_excel(self.path, 'test1', float_format='%.2f') + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(self.path, "test1", float_format="%.2f") reader = ExcelFile(self.path) - rs = read_excel(reader, 'test1', index_col=None) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - tm.assert_frame_equal(rs, xp) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, merge_cells, engine, ext): - # avoid mixed inferred_type - df = DataFrame([[u'\u0192', u'\u0193', u'\u0194'], - [u'\u0195', u'\u0196', u'\u0197']], - index=[u'A\u0192', u'B'], - columns=[u'X\u0193', u'Y', u'Z']) - - with ensure_clean('__tmp_to_excel_float_format__.' + ext) as filename: - df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') - result = read_excel(filename, 'TestSheet', encoding='utf8') + # Avoid mixed inferred_type. + df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"], + [u"\u0195", u"\u0196", u"\u0197"]], + index=[u"A\u0192", u"B"], + columns=[u"X\u0193", u"Y", u"Z"]) + + with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: + df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") + result = read_excel(filename, "TestSheet", + encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, merge_cells, engine, ext): - with ensure_clean(u('\u0192u.') + ext) as filename: + with ensure_clean(u("\u0192u.") + ext) as filename: try: - f = open(filename, 'wb') + f = open(filename, "wb") except UnicodeEncodeError: - pytest.skip('no unicode file names on this system') + pytest.skip("No unicode file names on this system") else: f.close() df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - - df.to_excel(filename, 'test1', float_format='%.2f') + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) - rs = read_excel(reader, 'test1', index_col=None) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) - tm.assert_frame_equal(rs, xp) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext): @@ -1691,106 +1773,83 @@ def test_to_excel_unicode_filename(self, merge_cells, engine, ext): # assert ws.cell(maddr).merged # os.remove(filename) - def test_excel_010_hemstring(self, merge_cells, engine, ext): - if merge_cells: - pytest.skip('Skip tests for merged MI format.') + @pytest.mark.parametrize("use_headers", [True, False]) + @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) + @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) + def test_excel_010_hemstring(self, merge_cells, engine, ext, + c_idx_nlevels, r_idx_nlevels, use_headers): - from pandas.util.testing import makeCustomDataframe as mkdf - # ensure limited functionality in 0.10 - # override of #2370 until sorted out in 0.11 + def roundtrip(data, header=True, parser_hdr=0, index=True): + data.to_excel(self.path, header=header, + merge_cells=merge_cells, index=index) - def roundtrip(df, header=True, parser_hdr=0, index=True): - - df.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) xf = ExcelFile(self.path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res - - nrows = 5 - ncols = 3 - for use_headers in (True, False): - for i in range(1, 4): # row multindex up to nlevel=3 - for j in range(1, 4): # col "" - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - - # this if will be removed once multi column excel writing - # is implemented for now fixing #9794 - if j > 1: - with pytest.raises(NotImplementedError): - res = roundtrip(df, use_headers, index=False) - else: - res = roundtrip(df, use_headers) - - if use_headers: - assert res.shape == (nrows, ncols + i) - else: - # first row taken as columns - assert res.shape == (nrows - 1, ncols + i) + return read_excel(xf, xf.sheet_names[0], header=parser_hdr) - # no nans - for r in range(len(res.index)): - for c in range(len(res.columns)): - assert res.iloc[r, c] is not np.nan + # Basic test. + parser_header = 0 if use_headers else None + res = roundtrip(DataFrame([0]), use_headers, parser_header) - res = roundtrip(DataFrame([0])) - assert res.shape == (1, 1) - assert res.iloc[0, 0] is not np.nan - - res = roundtrip(DataFrame([0]), False, None) assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan - def test_excel_010_hemstring_raises_NotImplementedError(self, merge_cells, - engine, ext): - # This test was failing only for j>1 and header=False, - # So I reproduced a simple test. - if merge_cells: - pytest.skip('Skip tests for merged MI format.') + # More complex tests with multi-index. + nrows = 5 + ncols = 3 from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 - # override of #2370 until sorted out in 0.11 + # override of gh-2370 until sorted out in 0.11 - def roundtrip2(df, header=True, parser_hdr=0, index=True): + df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, + c_idx_nlevels=c_idx_nlevels) - df.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) - xf = ExcelFile(self.path) - res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) - return res + # This if will be removed once multi-column Excel writing + # is implemented. For now fixing gh-9794. + if c_idx_nlevels > 1: + with pytest.raises(NotImplementedError): + roundtrip(df, use_headers, index=False) + else: + res = roundtrip(df, use_headers) - nrows = 5 - ncols = 3 - j = 2 - i = 1 - df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - with pytest.raises(NotImplementedError): - roundtrip2(df, header=False, index=False) + if use_headers: + assert res.shape == (nrows, ncols + r_idx_nlevels) + else: + # First row taken as columns. + assert res.shape == (nrows - 1, ncols + r_idx_nlevels) + + # No NaNs. + for r in range(len(res.index)): + for c in range(len(res.columns)): + assert res.iloc[r, c] is not np.nan - def test_duplicated_columns(self, merge_cells, engine, ext): - # Test for issue #5235 + def test_duplicated_columns(self, *_): + # see gh-5235 write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) - colnames = ['A', 'B', 'B'] + col_names = ["A", "B", "B"] - write_frame.columns = colnames - write_frame.to_excel(self.path, 'test1') + write_frame.columns = col_names + write_frame.to_excel(self.path, "test1") + + read_frame = read_excel(self.path, "test1", index_col=0) + read_frame.columns = col_names - read_frame = read_excel(self.path, 'test1') - read_frame.columns = colnames tm.assert_frame_equal(write_frame, read_frame) - # 11007 / #10970 + # see gh-11007, gh-10970 write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'A', 'B']) - write_frame.to_excel(self.path, 'test1') - read_frame = read_excel(self.path, 'test1') - read_frame.columns = ['A', 'B', 'A', 'B'] + columns=["A", "B", "A", "B"]) + write_frame.to_excel(self.path, "test1") + + read_frame = read_excel(self.path, "test1", index_col=0) + read_frame.columns = ["A", "B", "A", "B"] + tm.assert_frame_equal(write_frame, read_frame) - # 10982 - write_frame.to_excel(self.path, 'test1', index=False, header=False) - read_frame = read_excel(self.path, 'test1', header=None) + # see gh-10982 + write_frame.to_excel(self.path, "test1", index=False, header=False) + read_frame = read_excel(self.path, "test1", header=None) + write_frame.columns = [0, 1, 2, 3] tm.assert_frame_equal(write_frame, read_frame) @@ -1805,36 +1864,40 @@ def test_swapped_columns(self, merge_cells, engine, ext): tm.assert_series_equal(write_frame['A'], read_frame['A']) tm.assert_series_equal(write_frame['B'], read_frame['B']) - def test_invalid_columns(self, merge_cells, engine, ext): - # 10982 - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) + def test_invalid_columns(self, *_): + # see gh-10982 + write_frame = DataFrame({"A": [1, 1, 1], + "B": [2, 2, 2]}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - write_frame.to_excel(self.path, 'test1', columns=['B', 'C']) - expected = write_frame.reindex(columns=['B', 'C']) - read_frame = read_excel(self.path, 'test1') + write_frame.to_excel(self.path, "test1", columns=["B", "C"]) + + expected = write_frame.reindex(columns=["B", "C"]) + read_frame = read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(expected, read_frame) with pytest.raises(KeyError): - write_frame.to_excel(self.path, 'test1', columns=['C', 'D']) + write_frame.to_excel(self.path, "test1", columns=["C", "D"]) - def test_comment_arg(self, merge_cells, engine, ext): - # Re issue #18735 - # Test the comment argument functionality to read_excel + def test_comment_arg(self, *_): + # see gh-18735 + # + # Test the comment argument functionality to read_excel. - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") + + # Read file without comment arg. + result1 = read_excel(self.path, "test_c", index_col=0) - # Read file without comment arg - result1 = read_excel(self.path, 'test_c') result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None - result2 = read_excel(self.path, 'test_c', comment='#') + + result2 = read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) def test_comment_default(self, merge_cells, engine, ext): @@ -1851,22 +1914,23 @@ def test_comment_default(self, merge_cells, engine, ext): result2 = read_excel(self.path, 'test_c', comment=None) tm.assert_frame_equal(result1, result2) - def test_comment_used(self, merge_cells, engine, ext): - # Re issue #18735 - # Test the comment argument is working as expected when used + def test_comment_used(self, *_): + # see gh-18735 + # + # Test the comment argument is working as expected when used. - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") - # Test read_frame_comment against manually produced expected output - expected = DataFrame({'A': ['one', None, 'one'], - 'B': ['two', None, None]}) - result = read_excel(self.path, 'test_c', comment='#') + # Test read_frame_comment against manually produced expected output. + expected = DataFrame({"A": ["one", None, "one"], + "B": ["two", None, None]}) + result = read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) - def test_comment_emptyline(self, merge_cells, engine, ext): + def test_comment_empty_line(self, merge_cells, engine, ext): # Re issue #18735 # Test that read_excel ignores commented lines at the end of file @@ -1899,64 +1963,69 @@ def test_datetimes(self, merge_cells, engine, ext): tm.assert_series_equal(write_frame['A'], read_frame['A']) - # GH7074 def test_bytes_io(self, merge_cells, engine, ext): + # see gh-7074 bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) - # pass engine explicitly as there is no file path to infer from + + # Pass engine explicitly, as there is no file path to infer from. writer = ExcelWriter(bio, engine=engine) df.to_excel(writer) writer.save() + bio.seek(0) - reread_df = read_excel(bio) + reread_df = read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) - # GH8188 - def test_write_lists_dict(self, merge_cells, engine, ext): - df = DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], - 'numeric': [1, 2, 3.0], - 'str': ['apple', 'banana', 'cherry']}) + def test_write_lists_dict(self, *_): + # see gh-8188. + df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"]}) + df.to_excel(self.path, "Sheet1") + read = read_excel(self.path, "Sheet1", header=0, index_col=0) + expected = df.copy() expected.mixed = expected.mixed.apply(str) - expected.numeric = expected.numeric.astype('int64') + expected.numeric = expected.numeric.astype("int64") - df.to_excel(self.path, 'Sheet1') - read = read_excel(self.path, 'Sheet1', header=0) tm.assert_frame_equal(read, expected) - # GH13347 - def test_true_and_false_value_options(self, merge_cells, engine, ext): - df = pd.DataFrame([['foo', 'bar']], columns=['col1', 'col2']) - expected = df.replace({'foo': True, - 'bar': False}) + def test_true_and_false_value_options(self, *_): + # see gh-13347 + df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) + expected = df.replace({"foo": True, "bar": False}) df.to_excel(self.path) - read_frame = read_excel(self.path, true_values=['foo'], - false_values=['bar']) + read_frame = read_excel(self.path, true_values=["foo"], + false_values=["bar"], index_col=0) tm.assert_frame_equal(read_frame, expected) - def test_freeze_panes(self, merge_cells, engine, ext): - # GH15160 - expected = DataFrame([[1, 2], [3, 4]], columns=['col1', 'col2']) + def test_freeze_panes(self, *_): + # see gh-15160 + expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) - result = read_excel(self.path) - tm.assert_frame_equal(expected, result) - def test_path_pathlib(self, merge_cells, engine, ext): + result = read_excel(self.path, index_col=0) + tm.assert_frame_equal(result, expected) + + def test_path_path_lib(self, merge_cells, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) - reader = partial(pd.read_excel) + + reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(ext)) - tm.assert_frame_equal(df, result) + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) - def test_path_localpath(self, merge_cells, engine, ext): + def test_path_local_path(self, merge_cells, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) - reader = partial(pd.read_excel) + + reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, - path="foo.{}".format(ext)) - tm.assert_frame_equal(df, result) + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) @td.skip_if_no('openpyxl')