Skip to content

Commit

Permalink
BUG: Avoid flaky usecols set in C engine
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Dec 27, 2016
1 parent 7f0eefc commit 82cf55b
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,4 @@ Bug Fixes
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
60 changes: 48 additions & 12 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,24 +987,42 @@ def _evaluate_usecols(usecols, names):

def _validate_usecols_arg(usecols):
"""
Check whether or not the 'usecols' parameter
contains all integers (column selection by index),
strings (column by name) or is a callable. Raises
a ValueError if that is not the case.
Validate the 'usecols' parameter.
Checks whether or not the 'usecols' parameter contains all integers
(column selection by index), strings (column by name) or is a callable.
Raises a ValueError if that is not the case.
Parameters
----------
usecols : array-like, callable, or None
List of columns to use when parsing or a callable that can be used
to filter a list of table columns.
Returns
-------
usecols_tuple : tuple
A tuple of (verified_usecols, usecols_dtype).
'verified_usecols' is either a set if an array-like is passed in or
'usecols' if a callable or None is passed in.
'usecols_dtype` is the inferred dtype of 'usecols' if an array-like
is passed in or None if a callable or None is passed in.
"""
msg = ("'usecols' must either be all strings, all unicode, "
"all integers or a callable")

if usecols is not None:
if callable(usecols):
return usecols
return usecols, None
usecols_dtype = lib.infer_dtype(usecols)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)

return set(usecols)
return usecols
return set(usecols), usecols_dtype
return usecols, None


def _validate_parse_dates_arg(parse_dates):
Expand Down Expand Up @@ -1473,7 +1491,8 @@ def __init__(self, src, **kwds):
self._reader = _parser.TextReader(src, **kwds)

# XXX
self.usecols = _validate_usecols_arg(self._reader.usecols)
self.usecols, self.usecols_dtype = _validate_usecols_arg(
self._reader.usecols)

passed_names = self.names is None

Expand Down Expand Up @@ -1549,12 +1568,29 @@ def close(self):
pass

def _set_noconvert_columns(self):
"""
Set the columns that should not undergo dtype conversions.
Currently, any column that is involved with date parsing will not
undergo such conversions.
"""
names = self.orig_names
usecols = self.usecols
if self.usecols_dtype == 'integer':
# A set of integers will be converted to a list in
# the correct order every single time.
usecols = list(self.usecols)
elif (callable(self.usecols) or
self.usecols_dtype not in ('empty', None)):
# The names attribute should have the correct columns
# in the proper order for indexing with parse_dates.
usecols = self.names[:]
else:
# Usecols is empty.
usecols = None

def _set(x):
if usecols and is_integer(x):
x = list(usecols)[x]
if usecols is not None and is_integer(x):
x = usecols[x]

if not is_integer(x):
x = names.index(x)
Expand Down Expand Up @@ -1792,7 +1828,7 @@ def __init__(self, f, **kwds):
self.skipinitialspace = kwds['skipinitialspace']
self.lineterminator = kwds['lineterminator']
self.quoting = kwds['quoting']
self.usecols = _validate_usecols_arg(kwds['usecols'])
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
self.skip_blank_lines = kwds['skip_blank_lines']

self.names_passed = kwds['names'] or None
Expand Down
25 changes: 25 additions & 0 deletions pandas/io/tests/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

# See gh-14792
s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
parse_dates = [0]
usecols = list('abcdefghij')
cols = {'a': Timestamp('2016-09-21'),
'b': [1], 'c': [1], 'd': [2],
'e': [3], 'f': [4], 'g': [5],
'h': [6], 'i': [7], 'j': [8]}
expected = DataFrame(cols, columns=usecols)
df = self.read_csv(StringIO(s), usecols=usecols,
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
parse_dates = [[0, 1]]
usecols = list('abcdefghij')
cols = {'a_b': '2016/09/21 1',
'c': [1], 'd': [2], 'e': [3], 'f': [4],
'g': [5], 'h': [6], 'i': [7], 'j': [8]}
expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
df = self.read_csv(StringIO(s), usecols=usecols,
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

def test_usecols_with_parse_dates_and_full_names(self):
# See gh-9755
s = """0,1,20140101,0900,4
Expand Down

0 comments on commit 82cf55b

Please sign in to comment.