Skip to content

Commit

Permalink
BUG: Better handle larger numbers in to_numeric
Browse files Browse the repository at this point in the history
* Warn about lossiness when passing really large
numbers that exceed (u)int64 ranges.

* Coerce negative numbers to float when requested
instead of crashing and returning object.

* Consistently parse numbers as integers / floats,
even if we know that the resulting container has
to be float. This is to ensure consistent error
behavior when input numbers are too large.

Closes gh-24910.
  • Loading branch information
gfyoung committed Jan 27, 2019
1 parent 95f8dca commit 56b6921
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 15 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ Timezones
Numeric
^^^^^^^

- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
-
-
-
Expand Down
25 changes: 15 additions & 10 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
except (ValueError, OverflowError, TypeError):
pass

# otherwise, iterate and do full infererence
# Otherwise, iterate and do full inference.
cdef:
int status, maybe_int
Py_ssize_t i, n = values.size
Expand Down Expand Up @@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
else:
seen.float_ = True

if val <= oINT64_MAX:
if oINT64_MIN <= val <= oINT64_MAX:
ints[i] = val

if seen.sint_ and seen.uint_:
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
seen.float_ = True

elif util.is_bool_object(val):
Expand Down Expand Up @@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
else:
seen.saw_int(as_int)

if not (seen.float_ or as_int in na_values):
if as_int not in na_values:
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
raise ValueError('Integer out of range.')
if seen.coerce_numeric:
seen.float_ = True
else:
raise ValueError("Integer out of range.")
else:
if as_int >= 0:
uints[i] = as_int

if as_int >= 0:
uints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int

seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
else:
seen.float_ = True
except (TypeError, ValueError) as e:
if not seen.coerce_numeric:
raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
raise type(e)(str(e) + " at position {pos}".format(pos=i))
elif "uint64" in str(e): # Exception from check functions.
raise

seen.saw_null()
floats[i] = NaN

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
depending on the data supplied. Use the `downcast` parameter
to obtain other dtypes.
Please note that precision loss may occur if really large numbers
are passed in. Due to the internal limitations of `ndarray`, if
numbers smaller than `-9223372036854775808` or larger than
`18446744073709551615` are passed in, it is very likely they
will be converted to float so that they can be stored in an `ndarray`.
These warnings apply similarly to `Series` since it internally
leverages `ndarray`.
Parameters
----------
arg : scalar, list, tuple, 1-d array, or Series
Expand Down
128 changes: 123 additions & 5 deletions pandas/tests/tools/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from numpy import iinfo
import pytest

import pandas.compat as compat

import pandas as pd
from pandas import DataFrame, Index, Series, to_numeric
from pandas.util import testing as tm
Expand Down Expand Up @@ -172,7 +174,11 @@ def test_all_nan():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
def errors(request):
return request.param


def test_type_check(errors):
# see gh-11776
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
Expand All @@ -183,11 +189,123 @@ def test_type_check(errors):
to_numeric(df, **kwargs)


@pytest.mark.parametrize("val", [
1, 1.1, "1", "1.1", -1.5, "-1.5"
@pytest.fixture(params=[True, False])
def signed(request):
return request.param


@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
def transform(request):
return request.param


@pytest.mark.parametrize("val", [1, 1.1, 20001])
def test_scalar(val, signed, transform):
val = -val if signed else val
assert to_numeric(transform(val)) == float(val)


@pytest.fixture(params=[
47393996303418497800,
100000000000000000000
])
def test_scalar(val):
assert to_numeric(val) == float(val)
def large_val(request):
return request.param


def test_really_large_scalar(large_val, signed, transform, errors):
# see gh-24910
kwargs = dict(errors=errors) if errors is not None else dict()
val = -large_val if signed else large_val

val = transform(val)
val_is_string = isinstance(val, str)

if val_is_string and errors in (None, "raise"):
msg = "Integer out of range. at position 0"
with pytest.raises(ValueError, match=msg):
to_numeric(val, **kwargs)
else:
expected = float(val) if (errors == "coerce" and
val_is_string) else val
assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected)


@pytest.fixture(params=[True, False])
def multiple_elts(request):
return request.param


def test_really_large_in_arr(large_val, signed, transform,
multiple_elts, errors):
# see gh-24910
kwargs = dict(errors=errors) if errors is not None else dict()
val = -large_val if signed else large_val
val = transform(val)

extra_elt = "string"
arr = [val] + multiple_elts * [extra_elt]

val_is_string = isinstance(val, str)
coercing = errors == "coerce"

if errors in (None, "raise") and (val_is_string or multiple_elts):
if val_is_string:
msg = "Integer out of range. at position 0"
else:
msg = 'Unable to parse string "string" at position 1'

with pytest.raises(ValueError, match=msg):
to_numeric(arr, **kwargs)
else:
result = to_numeric(arr, **kwargs)

exp_val = float(val) if (coercing and val_is_string) else val
expected = [exp_val]

if multiple_elts:
if coercing:
expected.append(np.nan)
exp_dtype = float
else:
expected.append(extra_elt)
exp_dtype = object
else:
exp_dtype = float if isinstance(exp_val, (
int, compat.long, float)) else object

tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


def test_really_large_in_arr_consistent(large_val, signed,
multiple_elts, errors):
# see gh-24910
#
# Even if we discover that we have to hold float, does not mean
# we should be lenient on subsequent elements that fail to be integer.
kwargs = dict(errors=errors) if errors is not None else dict()
arr = [str(-large_val if signed else large_val)]

if multiple_elts:
arr.insert(0, large_val)

if errors in (None, "raise"):
index = int(multiple_elts)
msg = "Integer out of range. at position {index}".format(index=index)

with pytest.raises(ValueError, match=msg):
to_numeric(arr, **kwargs)
else:
result = to_numeric(arr, **kwargs)

if errors == "coerce":
expected = [float(i) for i in arr]
exp_dtype = float
else:
expected = arr
exp_dtype = object

tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


@pytest.mark.parametrize("errors,checker", [
Expand Down

0 comments on commit 56b6921

Please sign in to comment.