In [2]:
%load_ext Cython

In [3]:
import numpy as np
import pandas as pd

In [4]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [5]:
# v1 基础模板
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [8]:
print(result)

NameError: name 'result' is not defined

In [11]:
# %%timeit
result = target_mean_v1(data, 'y', 'x')
print(result)

[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]


In [9]:
# v2 不使用pandas
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    map_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        l = data.iloc[i]
        count_dict[l[x_name]] = count_dict.get(l[x_name], 0) + 1
        map_dict[l[x_name]] = map_dict.get(l[x_name], 0) + l[y_name]
    for i in range(data.shape[0]):
        l = data.iloc[i]
        result[i] = (map_dict.get(l[x_name])-l[y_name])/(count_dict[l[x_name]]-1)
    return result

In [18]:
%%timeit
result = target_mean_v2(data, 'y', 'x')
print(result)

[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
1.08 s ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%cython -a

# v3 引入cython 部分变量可强类型化
# 关闭 boundscheck
# 但是发现并没有什么用


cimport cython
cimport numpy as cnp
# cimport pandas as cpd
import numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v3(data, y_name, x_name):
    cdef cnp.ndarray[double] result = np.zeros(data.shape[0])
    cdef dict map_dict = dict()
    cdef dict count_dict = dict()
    for i in range(data.shape[0]):
        l = data.iloc[i]
        count_dict[l[x_name]] = count_dict.get(l[x_name], 0) + 1
        map_dict[l[x_name]] = map_dict.get(l[x_name], 0) + l[y_name]
    for i in range(data.shape[0]):
        l = data.iloc[i]
        result[i] = (map_dict.get(l[x_name])-l[y_name])/(count_dict[l[x_name]]-1)
    return result

In [16]:
%%timeit
result = target_mean_v3(data, 'y', 'x')
print(result)

[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
[0.48       0.48268839 0.50961538 ... 0.49897751 0.4283054  0.51698113]
1.18 s ± 110 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%cython -a

# 拆解DataFrame数据接口，变成两个cnp.ndarray[long]
# 效果明显

cimport cython
cimport numpy as cnp
# cimport pandas as cpd
import numpy as np
import pandas as pd

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v4(data, y_name, x_name):
    cdef cnp.ndarray[double] result = np.zeros(data.shape[0])
    cdef cnp.ndarray[long] xnda = data[x_name].values
    cdef cnp.ndarray[long] ynda = data[y_name].values
    cdef dict map_dict = dict()
    cdef dict count_dict = dict()
    for i in range(data.shape[0]):
        count_dict[xnda[i]] = count_dict.get(xnda[i], 0) + 1
        map_dict[xnda[i]] = map_dict.get(xnda[i], 0) + ynda[i]
    for i in range(data.shape[0]):
        result[i] = (map_dict.get(xnda[i])-ynda[i])/(count_dict[xnda[i]]-1)
    return result

In [24]:
%%timeit
result = target_mean_v4(data, 'y', 'x')
# print(result)

15.7 ms ± 633 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
