BUG: Avoid flaky usecols set in C engine

Closes gh-14792.
forking-repos · Dec 27, 2016 · 82cf55b · 82cf55b
1 parent 7f0eefc
commit 82cf55b
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 12 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -321,3 +321,4 @@ Bug Fixes
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
 - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
+- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -987,24 +987,42 @@ def _evaluate_usecols(usecols, names):
 
 def _validate_usecols_arg(usecols):
     """
-    Check whether or not the 'usecols' parameter
-    contains all integers (column selection by index),
-    strings (column by name) or is a callable. Raises
-    a ValueError if that is not the case.
+    Validate the 'usecols' parameter.
+
+    Checks whether or not the 'usecols' parameter contains all integers
+    (column selection by index), strings (column by name) or is a callable.
+    Raises a ValueError if that is not the case.
+
+    Parameters
+    ----------
+    usecols : array-like, callable, or None
+        List of columns to use when parsing or a callable that can be used
+        to filter a list of table columns.
+
+    Returns
+    -------
+    usecols_tuple : tuple
+        A tuple of (verified_usecols, usecols_dtype).
+
+        'verified_usecols' is either a set if an array-like is passed in or
+        'usecols' if a callable or None is passed in.
+
+        'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
+        is passed in or None if a callable or None is passed in.
     """
     msg = ("'usecols' must either be all strings, all unicode, "
            "all integers or a callable")
 
     if usecols is not None:
         if callable(usecols):
-            return usecols
+            return usecols, None
         usecols_dtype = lib.infer_dtype(usecols)
         if usecols_dtype not in ('empty', 'integer',
                                  'string', 'unicode'):
             raise ValueError(msg)
 
-        return set(usecols)
-    return usecols
+        return set(usecols), usecols_dtype
+    return usecols, None
 
 
 def _validate_parse_dates_arg(parse_dates):
@@ -1473,7 +1491,8 @@ def __init__(self, src, **kwds):
         self._reader = _parser.TextReader(src, **kwds)
 
         # XXX
-        self.usecols = _validate_usecols_arg(self._reader.usecols)
+        self.usecols, self.usecols_dtype = _validate_usecols_arg(
+            self._reader.usecols)
 
         passed_names = self.names is None
 
@@ -1549,12 +1568,29 @@ def close(self):
             pass
 
     def _set_noconvert_columns(self):
+        """
+        Set the columns that should not undergo dtype conversions.
+
+        Currently, any column that is involved with date parsing will not
+        undergo such conversions.
+        """
         names = self.orig_names
-        usecols = self.usecols
+        if self.usecols_dtype == 'integer':
+            # Integer usecols are positions. NOTE: list() over a set follows
+            # hash order, not ascending order (e.g. {1, 8}); sorted() is safer.
+            usecols = list(self.usecols)
+        elif (callable(self.usecols) or
+                self.usecols_dtype not in ('empty', None)):
+            # The names attribute should have the correct columns
+            # in the proper order for indexing with parse_dates.
+            usecols = self.names[:]
+        else:
+            # Usecols is empty.
+            usecols = None
 
         def _set(x):
-            if usecols and is_integer(x):
-                x = list(usecols)[x]
+            if usecols is not None and is_integer(x):
+                x = usecols[x]
 
             if not is_integer(x):
                 x = names.index(x)
@@ -1792,7 +1828,7 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.usecols = _validate_usecols_arg(kwds['usecols'])
+        self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
         self.names_passed = kwds['names'] or None

diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
                            parse_dates=parse_dates)
         tm.assert_frame_equal(df, expected)
 
+        # See gh-14792
+        s = """a,b,c,d,e,f,g,h,i,j
+        2016/09/21,1,1,2,3,4,5,6,7,8"""
+        parse_dates = [0]
+        usecols = list('abcdefghij')
+        cols = {'a': Timestamp('2016-09-21'),
+                'b': [1], 'c': [1], 'd': [2],
+                'e': [3], 'f': [4], 'g': [5],
+                'h': [6], 'i': [7], 'j': [8]}
+        expected = DataFrame(cols, columns=usecols)
+        df = self.read_csv(StringIO(s), usecols=usecols,
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
+        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
+        parse_dates = [[0, 1]]
+        usecols = list('abcdefghij')
+        cols = {'a_b': '2016/09/21 1',
+                'c': [1], 'd': [2], 'e': [3], 'f': [4],
+                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
+        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
+        df = self.read_csv(StringIO(s), usecols=usecols,
+                           parse_dates=parse_dates)
+        tm.assert_frame_equal(df, expected)
+
     def test_usecols_with_parse_dates_and_full_names(self):
         # See gh-9755
         s = """0,1,20140101,0900,4