diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b13..1ff591c86f6fa7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -319,5 +319,5 @@ Bug Fixes - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) -- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`) +- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index b6d1d4bb09f56f..c6c2a9e954f55e 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -944,26 +944,39 @@ def test_int64_overflow(self): 00013007854817840017963235 00013007854817840018860166""" + # 13007854817840016671868 > UINT64_MAX, so this + # will overflow and return object as the dtype. result = self.read_csv(StringIO(data)) self.assertTrue(result['ID'].dtype == object) - self.assertRaises(OverflowError, self.read_csv, - StringIO(data), converters={'ID': np.int64}) + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. + for conv in (np.int64, np.uint64): + self.assertRaises(OverflowError, self.read_csv, + StringIO(data), converters={'ID': conv}) - # Just inside int64 range: parse as integer + # These numbers fall right inside the int64 range, + # so they should be parsed as integer. 
i_max = np.iinfo(np.int64).max i_min = np.iinfo(np.int64).min + for x in [i_max, i_min]: result = self.read_csv(StringIO(str(x)), header=None) expected = DataFrame([x]) tm.assert_frame_equal(result, expected) - # Just outside int64 range: parse as string + # These numbers fall just outside the int64 range, + # so they should be parsed as string. too_big = i_max + 1 too_small = i_min - 1 + for x in [too_big, too_small]: result = self.read_csv(StringIO(str(x)), header=None) - expected = DataFrame([str(x)]) + if self.engine == 'python' and x == too_big: + expected = DataFrame([x]) + else: + expected = DataFrame([str(x)]) tm.assert_frame_equal(result, expected) def test_empty_with_nrows_chunksize(self): diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index b6b13def193ff7..22aa4cf3044d70 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, # core.common import for fast inference checks -npy_int64_max = np.iinfo(np.int64).max - - cpdef bint is_float(object obj): return util.is_float_object(obj) @@ -629,13 +626,38 @@ cdef extern from "parse_helper.h": cdef int64_t iINT64_MAX = INT64_MAX cdef int64_t iINT64_MIN = INT64_MIN +cdef uint64_t iUINT64_MAX = UINT64_MAX -def maybe_convert_numeric(object[:] values, set na_values, +def maybe_convert_numeric(ndarray[object] values, set na_values, bint convert_empty=True, bint coerce_numeric=False): """ - Type inference function-- convert strings to numeric (potentially) and - convert to proper dtype array + Convert object array to a numeric array if possible. + + Parameters + ---------- + values : ndarray + Array of object elements to convert. + na_values : set + Set of values that should be interpreted as NaN. + convert_empty : bool, default True + If an empty array-like object is encountered, whether to interpret + that element as NaN or not. 
If set to False, a ValueError will be + raised if such an element is encountered and 'coerce_numeric' is False. + coerce_numeric : bool, default False + If initial attempts to convert to numeric have failed, whether to + force conversion to numeric via alternative methods or by setting the + element to NaN. Otherwise, an Exception will be raised when such an + element is encountered. + + This boolean also has an impact on how conversion behaves when a + numeric array has no suitable numerical dtype to return (i.e. uint64, + int32, uint8). If set to False, the original object array will be + returned. Otherwise, a ValueError will be raised. + + Returns + ------- + numeric_array : array of converted object values to numerical ones """ cdef: int status, maybe_int @@ -643,7 +665,11 @@ def maybe_convert_numeric(object[:] values, set na_values, ndarray[float64_t] floats = np.empty(n, dtype='f8') ndarray[complex128_t] complexes = np.empty(n, dtype='c16') ndarray[int64_t] ints = np.empty(n, dtype='i8') + ndarray[uint64_t] uints = np.empty(n, dtype='u8') ndarray[uint8_t] bools = np.empty(n, dtype='u1') + bint seen_null = False + bint seen_uint = False + bint seen_sint = False bint seen_float = False bint seen_complex = False bint seen_int = False @@ -651,26 +677,106 @@ def maybe_convert_numeric(object[:] values, set na_values, object val float64_t fval + + def check_uint64_nan(): + """ + Check whether we have encountered uint64 when handling a NaN element. + + If uint64 has been encountered, we cannot safely cast to float64 due + to truncation problems (this would occur if we return a numeric array + containing a NaN element). + + Returns + ------- + return_values : bool + Whether or not we should return the original input array to avoid + data truncation. 
+ """ + if seen_null and seen_uint: + if not coerce_numeric: + return True + else: + raise ValueError("uint64 array detected, and such an " + "array cannot contain NaN.") + + return False + + + def check_uint64_int64_conflict(): + """ + Check whether we have encountered both int64 and uint64 elements. + + If both have been encountered, we cannot safely cast to an integer + dtype since none is large enough to hold both types of elements. + + Returns + ------- + return_values : bool + Whether or not we should return the original input array to avoid + data truncation. + """ + if seen_sint and seen_uint: + if not coerce_numeric: + return True + else: + raise ValueError("uint64 and negative values detected. " + "Cannot safely return a numeric array " + "without truncating data.") + + return False + for i in range(n): val = values[i] if val.__hash__ is not None and val in na_values: + seen_null = True + if check_uint64_nan(): + return values + floats[i] = complexes[i] = nan seen_float = True elif util.is_float_object(val): + if val != val: + seen_null = True + if check_uint64_nan(): + return values + floats[i] = complexes[i] = val seen_float = True elif util.is_integer_object(val): - floats[i] = ints[i] = val + floats[i] = complexes[i] = val + as_int = int(val) seen_int = True + + seen_uint = seen_uint or (as_int > iINT64_MAX) + seen_sint = seen_sint or (as_int < 0) + + if check_uint64_nan() or check_uint64_int64_conflict(): + return values + + if seen_uint: + uints[i] = as_int + elif seen_sint: + ints[i] = as_int + else: + uints[i] = as_int + ints[i] = as_int elif util.is_bool_object(val): - floats[i] = ints[i] = bools[i] = val + floats[i] = uints[i] = ints[i] = bools[i] = val seen_bool = True elif val is None: + seen_null = True + if check_uint64_nan(): + return values + floats[i] = complexes[i] = nan seen_float = True elif hasattr(val, '__len__') and len(val) == 0: if convert_empty or coerce_numeric: + seen_null = True + if check_uint64_nan(): + return values + 
floats[i] = complexes[i] = nan seen_float = True else: @@ -686,24 +792,55 @@ def maybe_convert_numeric(object[:] values, set na_values, status = floatify(val, &fval, &maybe_int) if fval in na_values: + seen_null = True + if check_uint64_nan(): + return values + floats[i] = complexes[i] = nan seen_float = True else: + if fval != fval: + seen_null = True + if check_uint64_nan(): + return values + floats[i] = fval - if not seen_float: - if maybe_int: - as_int = int(val) + if maybe_int: + as_int = int(val) - if as_int <= iINT64_MAX and as_int >= iINT64_MIN: + if as_int in na_values: + seen_float = True + seen_null = True + else: + seen_uint = seen_uint or (as_int > iINT64_MAX) + seen_sint = seen_sint or (as_int < 0) + seen_int = True + + if check_uint64_nan() or check_uint64_int64_conflict(): + return values + + if not (seen_float or as_int in na_values): + if as_int < iINT64_MIN or as_int > iUINT64_MAX: + raise ValueError('Integer out of range.') + + if seen_uint: + uints[i] = as_int + elif seen_sint: ints[i] = as_int else: - raise ValueError('integer out of range') - else: - seen_float = True + uints[i] = as_int + ints[i] = as_int + else: + seen_float = True except (TypeError, ValueError) as e: if not coerce_numeric: raise type(e)(str(e) + ' at position {}'.format(i)) + elif "uint64" in str(e): # Exception from check functions. 
+ raise + seen_null = True + if check_uint64_nan(): + return values floats[i] = nan seen_float = True @@ -713,9 +850,14 @@ def maybe_convert_numeric(object[:] values, set na_values, elif seen_float: return floats elif seen_int: - return ints + if seen_uint: + return uints + else: + return ints elif seen_bool: return bools.view(np.bool_) + elif seen_uint: + return uints return ints @@ -810,7 +952,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, floats[i] = val complexes[i] = val if not seen_null: - seen_uint = seen_uint or (int(val) > npy_int64_max) + seen_uint = seen_uint or (int(val) > iINT64_MAX) seen_sint = seen_sint or (val < 0) if seen_uint and seen_sint: diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index e79475ff91dca4..796c77354d6f82 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -255,6 +255,59 @@ def test_convert_non_hashable(self): result = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + def test_convert_numeric_uint64(self): + arr = np.array([2**63], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + arr = np.array([str(2**63)], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + arr = np.array([np.uint64(2**63)], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) + + def test_convert_numeric_uint64_nan(self): + msg = 'uint64 array detected' + cases = [(np.array([2**63, np.nan], dtype=object), set()), + (np.array([str(2**63), np.nan], dtype=object), set()), + (np.array([np.nan, 2**63], dtype=object), set()), + (np.array([np.nan, str(2**63)], dtype=object), set()), + (np.array([2**63, 2**63 + 1], dtype=object), set([2**63])), 
+ (np.array([str(2**63), str(2**63 + 1)], + dtype=object), set([2**63]))] + + for coerce in (True, False): + for arr, na_values in cases: + if coerce: + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric(arr, na_values, + coerce_numeric=coerce) + else: + tm.assert_numpy_array_equal(lib.maybe_convert_numeric( + arr, na_values), arr) + + def test_convert_numeric_int64_uint64(self): + msg = 'uint64 and negative values detected' + cases = [np.array([2**63, -1], dtype=object), + np.array([str(2**63), -1], dtype=object), + np.array([str(2**63), str(-1)], dtype=object), + np.array([-1, 2**63], dtype=object), + np.array([-1, str(2**63)], dtype=object), + np.array([str(-1), str(2**63)], dtype=object)] + + for coerce in (True, False): + for case in cases: + if coerce: + with tm.assertRaisesRegexp(ValueError, msg): + print(case) + lib.maybe_convert_numeric(case, set(), + coerce_numeric=coerce) + else: + tm.assert_numpy_array_equal(lib.maybe_convert_numeric( + case, set()), case) + def test_maybe_convert_objects_uint64(self): # see gh-4471 arr = np.array([2**63], dtype=object)