Skip to content

Commit

Permalink
DOC, TST, BUG: Improve uint64 core/algos behavior
Browse files Browse the repository at this point in the history
1) duplicated()

Updates documentation to describe the "values"
parameter in the signature, adds tests for uint64,
and refactors to use duplicated_uint64.

2) mode()

Updates documentation to describe the "values"
parameter in the signature, adds tests for uint64,
and refactors to use mode_uint64.

3) unique()

Uses UInt64HashTable to patch a uint64 overflow bug
analogous to that seen in Series.unique (patched in
gh-14915).

4) Types API

Introduces "is_signed_integer_dtype" and
"is_unsigned_integer_dtype" to the public API. Used in
refactoring/patching of 1-3.
  • Loading branch information
gfyoung committed Dec 22, 2016
1 parent f79bc7a commit 588f03b
Show file tree
Hide file tree
Showing 9 changed files with 343 additions and 117 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Expand Up @@ -298,5 +298,6 @@ Bug Fixes


- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
6 changes: 3 additions & 3 deletions pandas/api/tests/test_api.py
Expand Up @@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
'is_floating_dtype', 'is_int64_dtype', 'is_integer',
'is_integer_dtype', 'is_number', 'is_numeric_dtype',
'is_object_dtype', 'is_scalar', 'is_sparse',
'is_string_dtype',
'is_string_dtype', 'is_signed_integer_dtype',
'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
'is_period', 'is_period_dtype',
'is_re', 'is_re_compilable',
'is_unsigned_integer_dtype', 'is_period',
'is_period_dtype', 'is_re', 'is_re_compilable',
'is_dict_like', 'is_iterator',
'is_list_like', 'is_hashable',
'is_named_tuple', 'is_sequence',
Expand Down
37 changes: 30 additions & 7 deletions pandas/core/algorithms.py
Expand Up @@ -9,7 +9,9 @@
from pandas import compat, lib, tslib, _np_version_under1p8
from pandas.types.cast import _maybe_promote
from pandas.types.generic import ABCSeries, ABCIndex
from pandas.types.common import (is_integer_dtype,
from pandas.types.common import (is_unsigned_integer_dtype,
is_signed_integer_dtype,
is_integer_dtype,
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
Expand Down Expand Up @@ -490,12 +492,14 @@ def _value_counts_arraylike(values, dropna=True):

def duplicated(values, keep='first'):
"""
Return boolean ndarray denoting duplicate values
Return boolean ndarray denoting duplicate values.
.. versionadded:: 0.19.0
Parameters
----------
values : ndarray-like
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
Expand All @@ -521,9 +525,12 @@ def duplicated(values, keep='first'):
elif isinstance(values, (ABCSeries, ABCIndex)):
values = values.values

if is_integer_dtype(dtype):
if is_signed_integer_dtype(dtype):
values = _ensure_int64(values)
duplicated = htable.duplicated_int64(values, keep=keep)
elif is_unsigned_integer_dtype(dtype):
values = _ensure_uint64(values)
duplicated = htable.duplicated_uint64(values, keep=keep)
elif is_float_dtype(dtype):
values = _ensure_float64(values)
duplicated = htable.duplicated_float64(values, keep=keep)
Expand All @@ -535,7 +542,19 @@ def duplicated(values, keep='first'):


def mode(values):
"""Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
"""
Returns the mode(s) of an array.
Parameters
----------
values : array-like
Array over which to compute the mode(s).
Returns
-------
mode : Series
"""

# must sort because hash order isn't necessarily defined.
from pandas.core.series import Series

Expand All @@ -547,10 +566,12 @@ def mode(values):
constructor = Series

dtype = values.dtype
if is_integer_dtype(values):
if is_signed_integer_dtype(values):
values = _ensure_int64(values)
result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)

elif is_unsigned_integer_dtype(values):
values = _ensure_uint64(values)
result = constructor(sorted(htable.mode_uint64(values)), dtype=dtype)
elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
Expand Down Expand Up @@ -893,8 +914,10 @@ def _hashtable_algo(f, values, return_dtype=None):
dtype = values.dtype
if is_float_dtype(dtype):
return f(htable.Float64HashTable, _ensure_float64)
elif is_integer_dtype(dtype):
elif is_signed_integer_dtype(dtype):
return f(htable.Int64HashTable, _ensure_int64)
elif is_unsigned_integer_dtype(dtype):
return f(htable.UInt64HashTable, _ensure_uint64)
elif is_datetime64_dtype(dtype):
return_dtype = return_dtype or 'M8[ns]'
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
Expand Down
67 changes: 0 additions & 67 deletions pandas/hashtable.pyx
Expand Up @@ -167,73 +167,6 @@ cpdef value_count_object(ndarray[object] values,
return result_keys, result_counts


# mode_object: return the mode(s) of an object array, i.e. every key whose
# occurrence count equals the highest count found, provided that count is
# at least 2. Returns an empty array when no key occurs twice or more.
# Results come back in hash-bucket order, not sorted.
# `mask` is forwarded unchanged to build_count_table_object -- presumably it
# flags entries to exclude from counting (e.g. nulls); confirm in that helper.
@cython.wraparound(False)
@cython.boundscheck(False)
def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
cdef:
# max_count starts at 2, so a key seen only once can never qualify.
int count, max_count = 2
# j is the index of the last mode written into `modes`; -1 means none yet.
int j = -1 # so you can do +=
int k
ndarray[object] modes
kh_pymap_t *table

# Build a key -> count table over `values`.
table = kh_init_pymap()
build_count_table_object(values, mask, table)

# Sized by bucket count: an upper bound on the number of distinct keys.
modes = np.empty(table.n_buckets, dtype=np.object_)
for k in range(table.n_buckets):
# Only occupied buckets carry a key/count pair.
if kh_exist_pymap(table, k):
count = table.vals[k]

if count == max_count:
# Ties with the current best: append after the existing modes.
j += 1
elif count > max_count:
# New best count: discard earlier modes by restarting at slot 0.
max_count = count
j = 0
else:
continue
modes[j] = <object> table.keys[k]

# Free the hash table before returning.
kh_destroy_pymap(table)

# Keep only the filled prefix (empty slice when j is still -1).
return modes[:j + 1]


# mode_int64: return the mode(s) of an int64 buffer, i.e. every value whose
# occurrence count equals the highest count found, provided that count is
# at least 2. Returns an empty array when no value occurs twice or more.
# Results come back in hash-bucket order, not sorted.
@cython.wraparound(False)
@cython.boundscheck(False)
def mode_int64(int64_t[:] values):
cdef:
# max_count starts at 2, so a value seen only once can never qualify.
int count, max_count = 2
# j is the index of the last mode written into `modes`; -1 means none yet.
int j = -1 # so you can do +=
int k
kh_int64_t *table
ndarray[int64_t] modes

table = kh_init_int64()

# Build a value -> count table.
# NOTE(review): the trailing 0 is presumably a dropna-style flag; confirm
# against build_count_table_int64.
build_count_table_int64(values, table, 0)

# Sized by bucket count: an upper bound on the number of distinct values.
modes = np.empty(table.n_buckets, dtype=np.int64)

# The scan below runs with the GIL released.
with nogil:
for k in range(table.n_buckets):
# Only occupied buckets carry a key/count pair.
if kh_exist_int64(table, k):
count = table.vals[k]

if count == max_count:
# Ties with the current best: append after the existing modes.
j += 1
elif count > max_count:
# New best count: discard earlier modes by restarting at slot 0.
max_count = count
j = 0
else:
continue
modes[j] = table.keys[k]

# Free the hash table before returning.
kh_destroy_int64(table)

# Keep only the filled prefix (empty slice when j is still -1).
return modes[:j + 1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_object(ndarray[object] values, object keep='first'):
Expand Down
82 changes: 82 additions & 0 deletions pandas/src/hashtable_func_helper.pxi.in
Expand Up @@ -112,3 +112,85 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values,
return out

{{endfor}}

#----------------------------------------------------------------------
# Mode Computations
#----------------------------------------------------------------------

{{py:

# dtype, ctype, table_type, npy_dtype
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
('uint64', 'uint64_t', 'uint64', 'uint64'),
('object', 'object', 'pymap', 'object_')]
}}

{{for dtype, ctype, table_type, npy_dtype in dtypes}}


# mode_{{dtype}}: return the mode(s) of `values`, i.e. every key whose
# occurrence count equals the highest count found, provided that count is
# at least 2. Returns an empty array when no key occurs twice or more.
# Results come back in hash-bucket order, not sorted.
# The object variant takes an extra `mask` that is forwarded to
# build_count_table_object; the other variants take a typed memoryview.
@cython.wraparound(False)
@cython.boundscheck(False)

{{if dtype == 'object'}}


def mode_{{dtype}}(ndarray[{{ctype}}] values,
ndarray[uint8_t, cast=True] mask):
{{else}}


def mode_{{dtype}}({{ctype}}[:] values):
{{endif}}
cdef:
# max_count starts at 2, so a key seen only once can never qualify.
int count, max_count = 2
# j is the index of the last mode written into `modes`; -1 means none yet.
int j = -1 # so you can do +=
int k
kh_{{table_type}}_t *table
ndarray[{{ctype}}] modes

table = kh_init_{{table_type}}()

# Build a key -> count table; the helper's signature differs per variant.
# NOTE(review): the trailing 0 in the non-object call is presumably a
# dropna-style flag; confirm against build_count_table_{{dtype}}.
{{if dtype == 'object'}}
build_count_table_{{dtype}}(values, mask, table)
{{else}}
build_count_table_{{dtype}}(values, table, 0)
{{endif}}

# Sized by bucket count: an upper bound on the number of distinct keys.
modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})

# Both branches run the same scan. The non-object branch releases the GIL;
# the object branch keeps it and casts each stored key back to <object>.
{{if dtype != 'object'}}
with nogil:
for k in range(table.n_buckets):
# Only occupied buckets carry a key/count pair.
if kh_exist_{{table_type}}(table, k):
count = table.vals[k]

if count == max_count:
# Ties with the current best: append after the existing modes.
j += 1
elif count > max_count:
# New best count: discard earlier modes by restarting at slot 0.
max_count = count
j = 0
else:
continue

modes[j] = table.keys[k]
{{else}}
for k in range(table.n_buckets):
# Only occupied buckets carry a key/count pair.
if kh_exist_{{table_type}}(table, k):
count = table.vals[k]

if count == max_count:
# Ties with the current best: append after the existing modes.
j += 1
elif count > max_count:
# New best count: discard earlier modes by restarting at slot 0.
max_count = count
j = 0
else:
continue

modes[j] = <object> table.keys[k]
{{endif}}

# Free the hash table before returning.
kh_destroy_{{table_type}}(table)

# Keep only the filled prefix (empty slice when j is still -1).
return modes[:j + 1]

{{endfor}}

0 comments on commit 588f03b

Please sign in to comment.