From 3827e31a3d3e57534ed8fd1ea0847ac09297fd6b Mon Sep 17 00:00:00 2001
From: Matthew Rocklin
Date: Tue, 1 Nov 2016 12:38:33 -0400
Subject: [PATCH] Unify column names in dd.read_csv

Perhaps we should do this somewhere more globally?
Perhaps we shouldn't do this at all?
---
 dask/dataframe/io/csv.py            |  2 ++
 dask/dataframe/io/tests/test_csv.py | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/dask/dataframe/io/csv.py b/dask/dataframe/io/csv.py
index e45b4e7df09..b05c76ca5a3 100644
--- a/dask/dataframe/io/csv.py
+++ b/dask/dataframe/io/csv.py
@@ -56,6 +56,8 @@ def pandas_read_text(reader, b, header, kwargs, dtypes=None, columns=None,
 
     if enforce and columns and (list(df.columns) != list(columns)):
         raise ValueError("Columns do not match", df.columns, columns)
+    elif columns:
+        df.columns = columns
     return df
 
 
diff --git a/dask/dataframe/io/tests/test_csv.py b/dask/dataframe/io/tests/test_csv.py
index d053c1f7fbb..54e1b0c6f4e 100644
--- a/dask/dataframe/io/tests/test_csv.py
+++ b/dask/dataframe/io/tests/test_csv.py
@@ -667,6 +667,17 @@ def test_read_csv_singleton_dtype():
                   dd.read_csv(fn, dtype=float))
 
 
+def test_robust_column_mismatch():
+    files = csv_files.copy()
+    k = sorted(files)[-1]
+    files[k] = files[k].replace(b'name', b'Name')
+    with filetexts(files, mode='b'):
+        ddf = dd.read_csv('2014-01-*.csv')
+        df = pd.read_csv('2014-01-01.csv')
+        assert (df.columns == ddf.columns).all()
+        assert_eq(ddf, ddf)
+
+
 ############
 #  to_csv  #
 ############