CLN: Refactor cython to use memory views (pandas-dev#24932)

forking-repos · Jan 26, 2019 · 95f8dca
1 parent 602eda4
commit 95f8dca
Show file tree

Hide file tree

Showing 21 changed files with 240 additions and 214 deletions.
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -76,7 +76,7 @@ class NegInfinity(object):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-cpdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr):
+cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
     """
     Efficiently find the unique first-differences of the given array.
 
@@ -150,7 +150,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
+def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups):
     """
     compute a 1-d indexer that is an ordering of the passed index,
     ordered by the groups. This is a reverse of the label
@@ -230,7 +230,7 @@ def kth_smallest(numeric[:] a, Py_ssize_t k) -> numeric:
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
+def nancorr(const float64_t[:, :] mat, bint cov=0, minp=None):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
         bint minpv
@@ -294,7 +294,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, bint cov=0, minp=None):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
+def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
         ndarray[float64_t, ndim=2] result
@@ -435,8 +435,8 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_inplace(ndarray[algos_t] values,
-                ndarray[uint8_t, cast=True] mask,
+def pad_inplace(algos_t[:] values,
+                const uint8_t[:] mask,
                 limit=None):
     cdef:
         Py_ssize_t i, N
@@ -472,8 +472,8 @@ def pad_inplace(ndarray[algos_t] values,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def pad_2d_inplace(ndarray[algos_t, ndim=2] values,
-                   ndarray[uint8_t, ndim=2] mask,
+def pad_2d_inplace(algos_t[:, :] values,
+                   const uint8_t[:, :] mask,
                    limit=None):
     cdef:
         Py_ssize_t i, j, N, K
@@ -602,8 +602,8 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_inplace(ndarray[algos_t] values,
-                     ndarray[uint8_t, cast=True] mask,
+def backfill_inplace(algos_t[:] values,
+                     const uint8_t[:] mask,
                      limit=None):
     cdef:
         Py_ssize_t i, N
@@ -639,8 +639,8 @@ def backfill_inplace(ndarray[algos_t] values,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def backfill_2d_inplace(ndarray[algos_t, ndim=2] values,
-                        ndarray[uint8_t, ndim=2] mask,
+def backfill_2d_inplace(algos_t[:, :] values,
+                        const uint8_t[:, :] mask,
                         limit=None):
     cdef:
         Py_ssize_t i, j, N, K
@@ -678,7 +678,7 @@ def backfill_2d_inplace(ndarray[algos_t, ndim=2] values,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def arrmap(ndarray[algos_t] index, object func):
+def arrmap(algos_t[:] index, object func):
     cdef:
         Py_ssize_t length = index.shape[0]
         Py_ssize_t i = 0

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -29,10 +29,10 @@ def get_dispatch(dtypes):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels,
+def group_add_{{name}}({{c_type}}[:, :] out,
+                       int64_t[:] counts,
+                       {{c_type}}[:, :] values,
+                       const int64_t[:] labels,
                        Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
@@ -76,10 +76,10 @@ def group_add_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                        ndarray[int64_t] counts,
-                        ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels,
+def group_prod_{{name}}({{c_type}}[:, :] out,
+                        int64_t[:] counts,
+                        {{c_type}}[:, :] values,
+                        const int64_t[:] labels,
                         Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
@@ -123,10 +123,10 @@ def group_prod_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 @cython.wraparound(False)
 @cython.boundscheck(False)
 @cython.cdivision(True)
-def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels,
+def group_var_{{name}}({{c_type}}[:, :] out,
+                       int64_t[:] counts,
+                       {{c_type}}[:, :] values,
+                       const int64_t[:] labels,
                        Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -175,10 +175,10 @@ def group_var_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                        ndarray[int64_t] counts,
-                        ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels,
+def group_mean_{{name}}({{c_type}}[:, :] out,
+                        int64_t[:] counts,
+                        {{c_type}}[:, :] values,
+                        const int64_t[:] labels,
                         Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -220,11 +220,11 @@ def group_mean_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_ohlc_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[{{c_type}}, ndim=2] values,
-                  ndarray[int64_t] labels,
-                  Py_ssize_t min_count=-1):
+def group_ohlc_{{name}}({{c_type}}[:, :] out,
+                        int64_t[:] counts,
+                        {{c_type}}[:, :] values,
+                        const int64_t[:] labels,
+                        Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
@@ -293,10 +293,10 @@ def get_dispatch(dtypes):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                        ndarray[int64_t] counts,
-                        ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels,
+def group_last_{{name}}({{c_type}}[:, :] out,
+                        int64_t[:] counts,
+                        {{c_type}}[:, :] values,
+                        const int64_t[:] labels,
                         Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -350,10 +350,10 @@ def group_last_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[{{c_type}}, ndim=2] values,
-                       ndarray[int64_t] labels, int64_t rank,
+def group_nth_{{name}}({{c_type}}[:, :] out,
+                       int64_t[:] counts,
+                       {{c_type}}[:, :] values,
+                       const int64_t[:] labels, int64_t rank,
                        Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -411,9 +411,9 @@ def group_nth_{{name}}(ndarray[{{c_type}}, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
-                        ndarray[{{c_type}}, ndim=2] values,
-                        ndarray[int64_t] labels,
+def group_rank_{{name}}(float64_t[:, :] out,
+                        {{c_type}}[:, :] values,
+                        const int64_t[:] labels,
                         bint is_datetimelike, object ties_method,
                         bint ascending, bint pct, object na_option):
     """
@@ -606,10 +606,10 @@ ctypedef fused groupby_t:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_max(ndarray[groupby_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[groupby_t, ndim=2] values,
-              ndarray[int64_t] labels,
+def group_max(groupby_t[:, :] out,
+              int64_t[:] counts,
+              groupby_t[:, :] values,
+              const int64_t[:] labels,
               Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -669,10 +669,10 @@ def group_max(ndarray[groupby_t, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_min(ndarray[groupby_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[groupby_t, ndim=2] values,
-              ndarray[int64_t] labels,
+def group_min(groupby_t[:, :] out,
+              int64_t[:] counts,
+              groupby_t[:, :] values,
+              const int64_t[:] labels,
               Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
@@ -731,9 +731,9 @@ def group_min(ndarray[groupby_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummin(ndarray[groupby_t, ndim=2] out,
-                 ndarray[groupby_t, ndim=2] values,
-                 ndarray[int64_t] labels,
+def group_cummin(groupby_t[:, :] out,
+                 groupby_t[:, :] values,
+                 const int64_t[:] labels,
                  bint is_datetimelike):
     """
     Only transforms on axis=0
@@ -779,9 +779,9 @@ def group_cummin(ndarray[groupby_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_cummax(ndarray[groupby_t, ndim=2] out,
-                 ndarray[groupby_t, ndim=2] values,
-                 ndarray[int64_t] labels,
+def group_cummax(groupby_t[:, :] out,
+                 groupby_t[:, :] values,
+                 const int64_t[:] labels,
                  bint is_datetimelike):
     """
     Only transforms on axis=0

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
@@ -52,9 +52,10 @@ include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 
 cdef class Factorizer:
-    cdef public PyObjectHashTable table
-    cdef public ObjectVector uniques
-    cdef public Py_ssize_t count
+    cdef public:
+        PyObjectHashTable table
+        ObjectVector uniques
+        Py_ssize_t count
 
     def __init__(self, size_hint):
         self.table = PyObjectHashTable(size_hint)
@@ -96,9 +97,10 @@ cdef class Factorizer:
 
 
 cdef class Int64Factorizer:
-    cdef public Int64HashTable table
-    cdef public Int64Vector uniques
-    cdef public Py_ssize_t count
+    cdef public:
+        Int64HashTable table
+        Int64Vector uniques
+        Py_ssize_t count
 
     def __init__(self, size_hint):
         self.table = Int64HashTable(size_hint)
@@ -140,7 +142,7 @@ cdef class Int64Factorizer:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+def unique_label_indices(const int64_t[:] labels):
     """
     indices of the first occurrences of the unique labels
     *excluding* -1. equivalent to:
@@ -168,6 +170,6 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels):
     kh_destroy_int64(table)
 
     arr = idx.to_array()
-    arr = arr[labels[arr].argsort()]
+    arr = arr[np.asarray(labels)[arr].argsort()]
 
     return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -322,7 +322,7 @@ cdef class {{name}}HashTable(HashTable):
                 self.table.vals[k] = <Py_ssize_t>values[i]
 
     @cython.boundscheck(False)
-    def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values):
+    def map_locations(self, const {{dtype}}_t[:] values):
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -23,10 +23,11 @@ from pandas._libs.algos import ensure_int64
 
 cdef class BlockPlacement:
     # __slots__ = '_as_slice', '_as_array', '_len'
-    cdef slice _as_slice
-    cdef object _as_array
+    cdef:
+        slice _as_slice
+        object _as_array
 
-    cdef bint _has_slice, _has_array, _is_known_slice_like
+        bint _has_slice, _has_array, _is_known_slice_like
 
     def __init__(self, val):
         cdef: