# Task Desc
  use Cython to accelerate func `target_mean_v1`.
  
  **result**
  - origin:         13.8 s ± 293 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  - remove groupby: 198 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
  - use `Cython`:     27.4 µs ± 813 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
  - use `prange`:     27.9 µs ± 332 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)

## load Cython

## prepare data

In [4]:
import pandas as pd
import numpy as np

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

## baseline version, target_mean_v1

In [7]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

## remove `groupby`

In [8]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [9]:
%%timeit
target_mean_v1(data, 'y', 'x')

13.8 s ± 293 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
target_mean_v2(data, 'y', 'x')

187 ms ± 4.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## use Cython to accelerate

In [34]:
%%cython -a
# distutils: extra_compile_args=-fopenmp
# distutils: extra_link_args=-fopenmp

import time
import numpy as np
cimport numpy as cnp
cimport cython


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v3(data, y_name, x_name):
    X = data[x_name].values
    Y = data[y_name].values
    cdef Py_ssize_t N = X.shape[0] 
    v_dict = np.zeros(N, dtype=np.int)
    c_dict = np.zeros(N, dtype=np.int)
    cdef long[::1] value_dict = v_dict
    cdef long[::1] count_dict = c_dict

    cdef long[:] x_pt = X 
    cdef long[:] y_pt = Y
    cdef Py_ssize_t i
    cdef long x_id
    for i in range(N):
        x_id = x_pt[i]
        value_dict[x_id] += y_pt[i]
        count_dict[x_id] += 1
    
    cdef cnp.ndarray[double, ndim=1] result_pt = np.zeros(N, dtype=np.float64)
    
    for i in range(N): 
        x_id = x_pt[i]
        result_pt[i] = (value_dict[x_id] - y_pt[i]) / (count_dict[x_id] - 1)
    
    return result_pt


In [35]:
%%timeit
target_mean_v3(data, 'y', 'x')

24.5 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [36]:
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v3(data, 'y', 'x')
diff = np.linalg.norm(result_2 - result_3)
print(diff)

0.0


## use prange of Cython to run parallel

In [37]:
%%cython -a

import numpy as np 
cimport numpy as cnp
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v4(data, y_name, x_name):
    
    cdef int shape = data.shape[0]
    
    cdef cnp.ndarray[double, ndim=1] result = np.zeros(shape, dtype=np.float64)
    cdef cnp.ndarray[double, ndim=1] value_arr = np.zeros(10)
    cdef cnp.ndarray[double, ndim=1] count_arr = np.zeros(10)
    
    cdef cnp.ndarray[long] x_values = data[x_name].values
    cdef cnp.ndarray[long] y_values = data[y_name].values

    cdef int i = 0
    # without the GIL
    for i in prange(shape, nogil=True):
        value_arr[x_values[i]] += y_values[i]
        count_arr[x_values[i]] += 1

    for i in prange(shape, nogil=True):
        result[i] = (value_arr[x_values[i]] - y_values[i]) / (count_arr[x_values[i]] - 1)
    return result

In [38]:
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v3(data, 'y', 'x')
result_4 = target_mean_v4(data, 'y', 'x')
diff23 = np.linalg.norm(result_2 - result_3)
diff24 = np.linalg.norm(result_2 - result_4)
print(diff23)
print(diff24)

0.0
0.0


In [31]:
# %%timeit -r 100
%timeit -n 100 target_mean_v2(data, 'y', 'x')
%timeit -n 100 target_mean_v3(data, 'y', 'x')
%timeit -n 100 target_mean_v4(data, 'y', 'x')

198 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
27.4 µs ± 813 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
27.9 µs ± 332 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)
