In [1]:
import numpy as np
import pandas as pd

In [99]:
np.random.seed(0)
x = np.random.randint(10, size=(5000,1))
y = np.random.randint(2, size=(5000,1))

data = pd.DataFrame(np.concatenate([y,x],axis=1), columns=['y','x'])
data.head()

Unnamed: 0,y,x
0,1,5
1,1,0
2,0,3
3,0,3
4,1,7


# 1. 讲义中的 v2 版本

In [171]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
#     print(value_dict)
#     print(count_dict)
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [74]:
target_mean_v2(data, 'y', 'x')

array([0.53831041, 0.51089109, 0.45155039, ..., 0.47704591, 0.52410901,
       0.55331992])

# 2. Python 原生优化

## v3 提取变量优化
y, x = data.loc[i] 慢

In [172]:
def target_mean_v3(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        y, x = data.loc[i]
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[x] = y
            count_dict[x] = 1
        else:
            value_dict[x] += y
            count_dict[x] += 1
#     print(value_dict)
#     print(count_dict)
    for i in range(data.shape[0]):
        y, x = data.loc[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
    return result

In [173]:
target_mean_v3(data, 'y', 'x')

array([0.53831041, 0.51089109, 0.45155039, ..., 0.47704591, 0.52410901,
       0.55331992])

## v3a 提取变量优化
y, x = data.loc[i] 慢，y, x = data.loc[i, y_name],data.loc[i, x_name] 快，why？

In [174]:
def target_mean_v3a(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[x] = y
            count_dict[x] = 1
        else:
            value_dict[x] += y
            count_dict[x] += 1
#     print(value_dict)
#     print(count_dict)
    for i in range(data.shape[0]):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
    return result

In [175]:
%timeit target_mean_v2(data, 'y', 'x')
%timeit target_mean_v3(data, 'y', 'x')
%timeit target_mean_v3a(data, 'y', 'x')

377 ms ± 16.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.39 s ± 57.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
274 ms ± 19.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## v3b 尝试 defaultdict （时快时慢）

In [177]:
from collections import defaultdict

def target_mean_v3b(data, y_name, x_name):
    length = data.shape[0]
    result = np.zeros(length)
    value_dict = defaultdict(int)
    count_dict = defaultdict(int)
    for i in range(length):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        value_dict[x] += y
        count_dict[x] += 1

    for i in range(length):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

## v3c 尝试 setdefault（变快）

In [179]:
def target_mean_v3c(data, y_name, x_name):
    length = data.shape[0]
    result = np.zeros(length)
    value_dict = {}
    count_dict = {}
    for i in range(length):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i in range(length):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [90]:
target_mean_v3c(data, 'y', 'x')

array([0.53831041, 0.51089109, 0.45155039, ..., 0.47704591, 0.52410901,
       0.55331992])

In [181]:
%timeit target_mean_v3a(data, 'y', 'x')
%timeit target_mean_v3b(data, 'y', 'x')
%timeit target_mean_v3c(data, 'y', 'x')

# defaultdict 简化了代码，但速度变慢了, setdefault 简化了代码，且速度变快了

256 ms ± 7.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
222 ms ± 8.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
212 ms ± 5.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## v3d 尝试 groupby（简化了代码，但变慢了）

In [45]:
# 计算总体的 sum 和count
groupby_result = data.groupby(['x'], as_index=False).agg(['sum', 'count'])
groupby_result

Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,sum,count
x,Unnamed: 1_level_2,Unnamed: 2_level_2
0,259,506
1,247,502
2,251,478
3,233,517
4,241,490
5,275,510
6,240,502
7,260,491
8,271,506
9,276,498


In [59]:
sum, count = groupby_result.loc[9]
sum, count

(276, 498)

In [93]:
from collections import defaultdict

def target_mean_v3d(data, y_name, x_name):
    length = data.shape[0]
    result = np.zeros(length)
    groupby_result = data.groupby(['x'], as_index=False).agg(['sum', 'count'])

    for i in range(length):
        y, x = data.loc[i, y_name],data.loc[i, x_name]
        sum_total, count_total = groupby_result.loc[x]
        result[i] = (sum_total - y) / (count_total - 1)
    return result

In [94]:
target_mean_v3d(data, 'y', 'x')

array([0.53831041, 0.51089109, 0.45155039, ..., 0.47704591, 0.52410901,
       0.55331992])

In [182]:
%timeit target_mean_v3d(data, 'y', 'x')

889 ms ± 32.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## v4 DataFrame中的loc操作较慢,使用原始数组替换

In [186]:
def target_mean_v4(data, y_name, x_name):
    length = data.shape[0]
    result = np.zeros(length)
    xs, ys = data[x_name].tolist(), data[y_name].tolist()
    value_dict = {}
    count_dict = {}
    for i in range(length):
        y, x = ys[i], xs[i]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i in range(length):
        y, x = ys[i], xs[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [187]:
def target_mean_v4a(data, y_name, x_name):
    length = data.shape[0]
    result = np.zeros(length)
    xs, ys = data[x_name].values, data[y_name].values
    value_dict = {}
    count_dict = {}
    for i in range(length):
        y, x = ys[i], xs[i]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i in range(length):
        y, x = ys[i], xs[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [129]:
data['x'].values[:10]
data['x'].tolist()[:10]

[5, 0, 3, 3, 7, 9, 3, 5, 2, 4]

In [188]:
%timeit target_mean_v4(data, 'y', 'x')
%timeit target_mean_v4a(data, 'y', 'x')
# tolist 比 values 更快

4.16 ms ± 204 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
11.2 ms ± 311 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# 3. 使用 Cython 优化

In [110]:
%load_ext Cython

## cython 指定变量类型

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp

cpdef target_mean_v5_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(length)
    xs = data[x_name].tolist()
    ys = data[y_name].tolist()
    value_dict = dict()
    count_dict = dict()

    for i in range(length):
        y, x = ys[i], xs[i]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i in range(length):
        y, x = ys[i], xs[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp

cpdef target_mean_v5a_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(length)
    cdef cnp.ndarray[long] xs = data[x_name].values
    cdef cnp.ndarray[long] ys = data[y_name].values
    value_dict = dict()
    count_dict = dict()

    for i in range(length):
        y, x = ys[i], xs[i]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i in range(length):
        y, x = ys[i], xs[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [149]:
%timeit target_mean_v5_cython(data, 'y', 'x')
%timeit target_mean_v5a_cython(data, 'y', 'x')
# tolist 比 values 更快，但使用 cython 后 values 更快

1.64 ms ± 33.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.3 ms ± 29.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 去除类型检查、包装检查（没有加快）

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5b_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(length)
    cdef cnp.ndarray[long] xs = data[x_name].values
    cdef cnp.ndarray[long] ys = data[y_name].values
    value_dict = dict()
    count_dict = dict()
    
    for i from 0 <= i < length by 1:
        y, x = ys[i], xs[i]
        value_dict[x] = value_dict.setdefault(x, 0) + y
        count_dict[x] = count_dict.setdefault(x, 0) + 1

    for i from 0 <= i < length by 1:
        y, x = ys[i], xs[i]
        result[i] = (value_dict[x] - y) / (count_dict[x] - 1)
        
    return result

In [155]:
%timeit target_mean_v5a_cython(data, 'y', 'x')
%timeit target_mean_v5b_cython(data, 'y', 'x')

1.31 ms ± 37.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.3 ms ± 23.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 使用 memoryview 用数组代替dict（加快了）

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5c_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef double[:] result = np.zeros(length)
    cdef long[:] xs = data[x_name].values
    cdef long[:] ys = data[y_name].values
    cdef long[:] value = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    
    for i from 0 <= i < length by 1:
        y, x = ys[i], xs[i]
        value[x] += y
        count[x] += 1

    for i from 0 <= i < length by 1:
        y, x = ys[i], xs[i]
        result[i] = (value[x] - y) / (count[x] - 1)
        
    return result

In [158]:
%timeit target_mean_v5c_cython(data, 'y', 'x')

747 µs ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## 使用prange代替for循环（加快了）

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5d_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef double[:] result = np.zeros(length)
    cdef long[:] xs = data[x_name].values
    cdef long[:] ys = data[y_name].values
    cdef long[:] value = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    
    for i in prange(length, nogil=True):
        y, x = ys[i], xs[i]
        value[x] += y
        count[x] += 1

    for i from 0 <= i < length by 1:
        y, x = ys[i], xs[i]
        result[i] = (value[x] - y) / (count[x] - 1)
        
    return result

## 以上代码 prange nogil 赋值报错，修改如下

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5d_cython(data, str y_name, str x_name):
    cdef int length = data.shape[0]
    cdef double[:] result = np.zeros(length)
    cdef long[:] xs = data[x_name].values
    cdef long[:] ys = data[y_name].values
    cdef long[:] value = np.zeros(10).astype(long)
    cdef long[:] count = np.zeros(10).astype(long)
    
    cdef int i = 0
    for i in prange(length, nogil=True):
        value[xs[i]] += ys[i]
        count[xs[i]] += 1

    for i in prange(length, nogil=True):
        result[i] = (value[xs[i]] - ys[i]) / (count[xs[i]] - 1)

    return result

In [165]:
%timeit target_mean_v5d_cython(data, 'y', 'x')

53.8 µs ± 650 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [189]:
# 讲义的 v2 版本
%timeit target_mean_v2(data, 'y', 'x')

# python 原生最快的版本
%timeit target_mean_v4(data, 'y', 'x')

# cython 最快的版本
%timeit target_mean_v5d_cython(data, 'y', 'x')

376 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4.34 ms ± 319 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
56.7 µs ± 2.09 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# 4. 总结

python 原生优化：

* 使用简单的数据结构，数组取数据比DataFrame 快
* 使用简单的算法，自己实现的比自带的 groupby快, setdefault 比 defaultdict 快

Cython 优化:

* 指定变量类型
* 使用 memoryview
* 使用 prange代替for循环