Skip to content

Commit

Permalink
BUG: Avoid flaky usecols set in C engine
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Dec 25, 2016
1 parent aba7d25 commit 30bae02
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -318,3 +318,4 @@ Bug Fixes
- Require at least version 0.23 of Cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` was being indexed incorrectly when combined with ``parse_dates`` (:issue:`14792`)
28 changes: 20 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,14 +997,14 @@ def _validate_usecols_arg(usecols):

if usecols is not None:
if callable(usecols):
return usecols
return usecols, None
usecols_dtype = lib.infer_dtype(usecols)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)

return set(usecols)
return usecols
return set(usecols), usecols_dtype
return usecols, None


def _validate_parse_dates_arg(parse_dates):
Expand Down Expand Up @@ -1473,7 +1473,8 @@ def __init__(self, src, **kwds):
self._reader = _parser.TextReader(src, **kwds)

# XXX
self.usecols = _validate_usecols_arg(self._reader.usecols)
self.usecols, self.usecols_dtype = _validate_usecols_arg(
self._reader.usecols)

passed_names = self.names is None

Expand Down Expand Up @@ -1550,11 +1551,22 @@ def close(self):

def _set_noconvert_columns(self):
names = self.orig_names
usecols = self.usecols
if self.usecols_dtype == 'integer':
# A set of integers will be converted to a list in
# the correct order every single time.
usecols = list(self.usecols)
elif (callable(self.usecols) or
self.usecols_dtype not in ('empty', None)):
# The names attribute should have the correct columns
# in the proper order for indexing with parse_dates.
usecols = self.names[:]
else:
# Usecols is empty.
usecols = None

def _set(x):
if usecols and is_integer(x):
x = list(usecols)[x]
if usecols is not None and is_integer(x):
x = usecols[x]

if not is_integer(x):
x = names.index(x)
Expand Down Expand Up @@ -1792,7 +1804,7 @@ def __init__(self, f, **kwds):
self.skipinitialspace = kwds['skipinitialspace']
self.lineterminator = kwds['lineterminator']
self.quoting = kwds['quoting']
self.usecols = _validate_usecols_arg(kwds['usecols'])
self.usecols, _ = _validate_usecols_arg(kwds['usecols'])
self.skip_blank_lines = kwds['skip_blank_lines']

self.names_passed = kwds['names'] or None
Expand Down
25 changes: 25 additions & 0 deletions pandas/io/tests/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,31 @@ def test_usecols_with_parse_dates(self):
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

# See gh-14792
s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
parse_dates = [0]
usecols = list('abcdefghij')
cols = {'a': Timestamp('2016-09-21'),
'b': [1], 'c': [1], 'd': [2],
'e': [3], 'f': [4], 'g': [5],
'h': [6], 'i': [7], 'j': [8]}
expected = DataFrame(cols, columns=usecols)
df = self.read_csv(StringIO(s), usecols=usecols,
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
parse_dates = [[0, 1]]
usecols = list('abcdefghij')
cols = {'a_b': '2016/09/21 1',
'c': [1], 'd': [2], 'e': [3], 'f': [4],
'g': [5], 'h': [6], 'i': [7], 'j': [8]}
expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
df = self.read_csv(StringIO(s), usecols=usecols,
parse_dates=parse_dates)
tm.assert_frame_equal(df, expected)

def test_usecols_with_parse_dates_and_full_names(self):
# See gh-9755
s = """0,1,20140101,0900,4
Expand Down

0 comments on commit 30bae02

Please sign in to comment.