Skip to content

Commit

Permalink
BUG: Convert uint64 in maybe_convert_numeric
Browse files Browse the repository at this point in the history
Add handling for uint64 elements in an array
with the following behavior specifications:

1) If uint64 and NaN are both detected, the
original input will be returned if coerce_numeric
is False. Otherwise, an Exception is raised.

2) If uint64 and negative numbers are both
detected, the original input will be returned if
coerce_numeric is False. Otherwise, an
Exception is raised.

Closes gh-14982.
  • Loading branch information
gfyoung committed Dec 28, 2016
1 parent 3e3434b commit 9cde6a6
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 23 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Expand Up @@ -319,5 +319,5 @@ Bug Fixes


- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
23 changes: 18 additions & 5 deletions pandas/io/tests/parser/common.py
Expand Up @@ -944,26 +944,39 @@ def test_int64_overflow(self):
00013007854817840017963235
00013007854817840018860166"""

# 13007854817840016671868 > UINT64_MAX, so this
# will overflow and return object as the dtype.
result = self.read_csv(StringIO(data))
self.assertTrue(result['ID'].dtype == object)

self.assertRaises(OverflowError, self.read_csv,
StringIO(data), converters={'ID': np.int64})
# 13007854817840016671868 > UINT64_MAX, so attempts
# to cast to either int64 or uint64 will result in
# an OverflowError being raised.
for conv in (np.int64, np.uint64):
self.assertRaises(OverflowError, self.read_csv,
StringIO(data), converters={'ID': conv})

# Just inside int64 range: parse as integer
# These numbers fall right inside the int64 range,
# so they should be parsed as integer.
i_max = np.iinfo(np.int64).max
i_min = np.iinfo(np.int64).min

for x in [i_max, i_min]:
result = self.read_csv(StringIO(str(x)), header=None)
expected = DataFrame([x])
tm.assert_frame_equal(result, expected)

# Just outside int64 range: parse as string
# These numbers fall just outside the int64 range,
# so they should be parsed as string.
too_big = i_max + 1
too_small = i_min - 1

for x in [too_big, too_small]:
result = self.read_csv(StringIO(str(x)), header=None)
expected = DataFrame([str(x)])
if self.engine == 'python' and x == too_big:
expected = DataFrame([x])
else:
expected = DataFrame([str(x)])
tm.assert_frame_equal(result, expected)

def test_empty_with_nrows_chunksize(self):
Expand Down
176 changes: 159 additions & 17 deletions pandas/src/inference.pyx
Expand Up @@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,

# core.common import for fast inference checks

npy_int64_max = np.iinfo(np.int64).max


cpdef bint is_float(object obj):
return util.is_float_object(obj)

Expand Down Expand Up @@ -629,48 +626,157 @@ cdef extern from "parse_helper.h":

cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
cdef uint64_t iUINT64_MAX = <uint64_t> UINT64_MAX


def maybe_convert_numeric(object[:] values, set na_values,
def maybe_convert_numeric(ndarray[object] values, set na_values,
bint convert_empty=True, bint coerce_numeric=False):
"""
Type inference function-- convert strings to numeric (potentially) and
convert to proper dtype array
Convert object array to a numeric array if possible.
Parameters
----------
values : ndarray
Array of object elements to convert.
na_values : set
Set of values that should be interpreted as NaN.
convert_empty : bool, default True
If an empty array-like object is encountered, whether to interpret
that element as NaN or not. If set to False, a ValueError will be
raised if such an element is encountered and 'coerce_numeric' is False.
coerce_numeric : bool, default False
If initial attempts to convert to numeric have failed, whether to
force conversion to numeric via alternative methods or by setting the
element to NaN. Otherwise, an Exception will be raised when such an
element is encountered.
This boolean also has an impact on how conversion behaves when a
numeric array has no suitable numerical dtype to return (i.e. uint64,
int32, uint8). If set to False, the original object array will be
returned. Otherwise, a ValueError will be raised.
Returns
-------
numeric_array : array of converted object values to numerical ones
"""
cdef:
int status, maybe_int
Py_ssize_t i, n = values.size
ndarray[float64_t] floats = np.empty(n, dtype='f8')
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
ndarray[int64_t] ints = np.empty(n, dtype='i8')
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
bint seen_null = False
bint seen_uint = False
bint seen_sint = False
bint seen_float = False
bint seen_complex = False
bint seen_int = False
bint seen_bool = False
object val
float64_t fval


def check_uint64_nan():
"""
Check whether we have encountered uint64 when handling a NaN element.
If uint64 has been encountered, we cannot safely cast to float64 due
to truncation problems (this would occur if we return a numeric array
containing a NaN element).
Returns
-------
return_values : bool
Whether or not we should return the original input array to avoid
data truncation.
"""
if seen_null and seen_uint:
if not coerce_numeric:
return True
else:
raise ValueError("uint64 array detected, and such an "
"array cannot contain NaN.")

return False


def check_uint64_int64_conflict():
"""
Check whether we have encountered both int64 and uint64 elements.
If both have been encountered, we cannot safely cast to an integer
dtype since none is large enough to hold both types of elements.
Returns
-------
return_values : bool
Whether or not we should return the original input array to avoid
data truncation.
"""
if seen_sint and seen_uint:
if not coerce_numeric:
return True
else:
raise ValueError("uint64 and negative values detected. "
"Cannot safely return a numeric array "
"without truncating data.")

return False

for i in range(n):
val = values[i]

if val.__hash__ is not None and val in na_values:
seen_null = True
if check_uint64_nan():
return values

floats[i] = complexes[i] = nan
seen_float = True
elif util.is_float_object(val):
if val != val:
seen_null = True
if check_uint64_nan():
return values

floats[i] = complexes[i] = val
seen_float = True
elif util.is_integer_object(val):
floats[i] = ints[i] = val
floats[i] = complexes[i] = val
as_int = int(val)
seen_int = True

seen_uint = seen_uint or (as_int > iINT64_MAX)
seen_sint = seen_sint or (as_int < 0)

if check_uint64_nan() or check_uint64_int64_conflict():
return values

if seen_uint:
uints[i] = as_int
elif seen_sint:
ints[i] = as_int
else:
uints[i] = as_int
ints[i] = as_int
elif util.is_bool_object(val):
floats[i] = ints[i] = bools[i] = val
floats[i] = uints[i] = ints[i] = bools[i] = val
seen_bool = True
elif val is None:
seen_null = True
if check_uint64_nan():
return values

floats[i] = complexes[i] = nan
seen_float = True
elif hasattr(val, '__len__') and len(val) == 0:
if convert_empty or coerce_numeric:
seen_null = True
if check_uint64_nan():
return values

floats[i] = complexes[i] = nan
seen_float = True
else:
Expand All @@ -686,24 +792,55 @@ def maybe_convert_numeric(object[:] values, set na_values,
status = floatify(val, &fval, &maybe_int)

if fval in na_values:
seen_null = True
if check_uint64_nan():
return values

floats[i] = complexes[i] = nan
seen_float = True
else:
if fval != fval:
seen_null = True
if check_uint64_nan():
return values

floats[i] = fval

if not seen_float:
if maybe_int:
as_int = int(val)
if maybe_int:
as_int = int(val)

if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
if as_int in na_values:
seen_float = True
seen_null = True
else:
seen_uint = seen_uint or (as_int > iINT64_MAX)
seen_sint = seen_sint or (as_int < 0)
seen_int = True

if check_uint64_nan() or check_uint64_int64_conflict():
return values

if not (seen_float or as_int in na_values):
if as_int < iINT64_MIN or as_int > iUINT64_MAX:
raise ValueError('Integer out of range.')

if seen_uint:
uints[i] = as_int
elif seen_sint:
ints[i] = as_int
else:
raise ValueError('integer out of range')
else:
seen_float = True
uints[i] = as_int
ints[i] = as_int
else:
seen_float = True
except (TypeError, ValueError) as e:
if not coerce_numeric:
raise type(e)(str(e) + ' at position {}'.format(i))
elif "uint64" in str(e): # Exception from check functions.
raise
seen_null = True
if check_uint64_nan():
return values

floats[i] = nan
seen_float = True
Expand All @@ -713,9 +850,14 @@ def maybe_convert_numeric(object[:] values, set na_values,
elif seen_float:
return floats
elif seen_int:
return ints
if seen_uint:
return uints
else:
return ints
elif seen_bool:
return bools.view(np.bool_)
elif seen_uint:
return uints
return ints


Expand Down Expand Up @@ -810,7 +952,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
floats[i] = <float64_t> val
complexes[i] = <double complex> val
if not seen_null:
seen_uint = seen_uint or (int(val) > npy_int64_max)
seen_uint = seen_uint or (int(val) > iINT64_MAX)
seen_sint = seen_sint or (val < 0)

if seen_uint and seen_sint:
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/types/test_inference.py
Expand Up @@ -255,6 +255,59 @@ def test_convert_non_hashable(self):
result = lib.maybe_convert_numeric(arr, set(), False, True)
tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))

def test_convert_numeric_uint64(self):
arr = np.array([2**63], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)

arr = np.array([str(2**63)], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)

arr = np.array([np.uint64(2**63)], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)

def test_convert_numeric_uint64_nan(self):
msg = 'uint64 array detected'
cases = [(np.array([2**63, np.nan], dtype=object), set()),
(np.array([str(2**63), np.nan], dtype=object), set()),
(np.array([np.nan, 2**63], dtype=object), set()),
(np.array([np.nan, str(2**63)], dtype=object), set()),
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
(np.array([str(2**63), str(2**63 + 1)],
dtype=object), set([2**63]))]

for coerce in (True, False):
for arr, na_values in cases:
if coerce:
with tm.assertRaisesRegexp(ValueError, msg):
lib.maybe_convert_numeric(arr, na_values,
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
arr, na_values), arr)

def test_convert_numeric_int64_uint64(self):
msg = 'uint64 and negative values detected'
cases = [np.array([2**63, -1], dtype=object),
np.array([str(2**63), -1], dtype=object),
np.array([str(2**63), str(-1)], dtype=object),
np.array([-1, 2**63], dtype=object),
np.array([-1, str(2**63)], dtype=object),
np.array([str(-1), str(2**63)], dtype=object)]

for coerce in (True, False):
for case in cases:
if coerce:
with tm.assertRaisesRegexp(ValueError, msg):
print(case)
lib.maybe_convert_numeric(case, set(),
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
case, set()), case)

def test_maybe_convert_objects_uint64(self):
# see gh-4471
arr = np.array([2**63], dtype=object)
Expand Down

0 comments on commit 9cde6a6

Please sign in to comment.