In [6]:
import numpy as np
import pandas as pd


提前导入line_profiler用于分析python代码

In [7]:
pip install line_profiler

Collecting line_profiler
[?25l  Downloading https://files.pythonhosted.org/packages/66/eb/417ace64f45fee7a0394946f8e1f90f925420fd9b14f1f09abb5284a0ca4/line_profiler-3.1.0-cp36-cp36m-manylinux2010_x86_64.whl (63kB)
[K     |█████▏                          | 10kB 18.0MB/s eta 0:00:01[K     |██████████▎                     | 20kB 20.2MB/s eta 0:00:01[K     |███████████████▍                | 30kB 11.9MB/s eta 0:00:01[K     |████████████████████▌           | 40kB 9.1MB/s eta 0:00:01[K     |█████████████████████████▋      | 51kB 4.3MB/s eta 0:00:01[K     |██████████████████████████████▊ | 61kB 4.8MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.7MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.1.0


In [8]:
x = np.random.randint(10,size=(5000,1))
y = np.random.randint(2,size=(5000,1))
data = pd.DataFrame(np.concatenate([y,x],axis=1),columns=['y', 'x'])

In [9]:
def target_mean_v1(data, y_name, x_name):
  result = np.zeros(data.shape[0])
  for i in range(data.shape[0]):
    groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
    result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
  return result



In [10]:
%%timeit
target_mean_v1(data,'y','x')

1 loop, best of 3: 23.1 s per loop


花的时间太多了，下面利用profiler分析每一行语句所用的时间

In [11]:
from line_profiler import LineProfiler 
profile = LineProfiler(target_mean_v1)
profile.enable()
target_mean_v1(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 47.9949 s
File: <ipython-input-9-ce8f8ebb5d69>
Function: target_mean_v1 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v1(data, y_name, x_name):
     2         1        668.0    668.0      0.0    result = np.zeros(data.shape[0])
     3      5001       9920.0      2.0      0.0    for i in range(data.shape[0]):
     4      5000   40392904.0   8078.6     84.2      groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
     5      5000    7591399.0   1518.3     15.8      result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
     6         1          1.0      1.0      0.0    return result



上图可见最严重的就是group_by,因为每个group_by都是一个循环遍历，接下来我们试着去掉groupby

In [22]:
def target_mean_v2(data: pd.DataFrame, y_name: str, x_name: str) ->np.ndarray:
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()
  for i in range(data.shape[0]):
    if data.loc[i, x_name] not in value_dict.keys():
      value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] = 1
    else:
      value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] += 1
  for i in range(data.shape[0]):
    result[i]=(value_dict[data.loc[i,x_name]] - data.loc[i, y_name])/(count_dict[data.loc[i, x_name]]-1)

In [24]:
%%timeit
target_mean_v2(data,'y','x')

1 loop, best of 3: 259 ms per loop


可以看到我们有了几百倍的速度提升，接下来我们继续分析它的代码


In [25]:
from line_profiler import LineProfiler
profile = LineProfiler(target_mean_v2)
profile.enable()
target_mean_v2(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.700698 s
File: <ipython-input-22-602f1d6e82dd>
Function: target_mean_v2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v2(data: pd.DataFrame, y_name: str, x_name: str) ->np.ndarray:
     2         1         58.0     58.0      0.0    result = np.zeros(data.shape[0])
     3         1          2.0      2.0      0.0    value_dict = dict()
     4         1          1.0      1.0      0.0    count_dict = dict()
     5      5001       2742.0      0.5      0.4    for i in range(data.shape[0]):
     6      5000     101959.0     20.4     14.6      if data.loc[i, x_name] not in value_dict.keys():
     7        10        485.0     48.5      0.1        value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
     8        10        243.0     24.3      0.0        count_dict[data.loc[i, x_name]] = 1
     9                                               else:
    10      4990     1

这里我们发现有data.loc的地方时间比较多，假设是它的锅，我们试着用变量一次代替它多次

In [26]:

def target_mean_v3(data:pd.DataFrame, y_name:str,x_name:str) ->np.ndarray:
  data_shape = data.shape[0]
  result=np.zeros(data_shape)
  value_dict=dict()
  count_dict=dict()
  for i in range(data_shape):
    data_loc_x = data.loc[i,x_name]
    data_loc_y = data.loc[i,y_name]
    if data_loc_x not in value_dict:
      value_dict[data_loc_x]= data_loc_y
      count_dict[data_loc_x] = 1
    else:
      value_dict[data_loc_x] += data_loc_y
      count_dict[data_loc_x] += 1
  for i in range(data_shape):
    data_loc_x = data.loc[i, x_name]
    data_loc_y = data.loc[i, y_name]
    result[i] = (value_dict[data_loc_x]- data_loc_y)/(count_dict[data_loc_x]-1)
  return result

In [27]:
%%timeit
target_mean_v3(data,'y','x')

10 loops, best of 3: 151 ms per loop


我们看到有了不到两倍的提升，接下来我们再来分析它每行的时间继续优化

In [28]:

profile = LineProfiler(target_mean_v3)
profile.enable()
target_mean_v3(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.418692 s
File: <ipython-input-26-95a93e4b3507>
Function: target_mean_v3 at line 2

Line #      Hits         Time  Per Hit   % Time  Line Contents
     2                                           def target_mean_v3(data:pd.DataFrame, y_name:str,x_name:str) ->np.ndarray:
     3         1         78.0     78.0      0.0    data_shape = data.shape[0]
     4         1        115.0    115.0      0.0    result=np.zeros(data_shape)
     5         1          2.0      2.0      0.0    value_dict=dict()
     6         1          1.0      1.0      0.0    count_dict=dict()
     7      5001       2497.0      0.5      0.6    for i in range(data_shape):
     8      5000     102890.0     20.6     24.6      data_loc_x = data.loc[i,x_name]
     9      5000     101681.0     20.3     24.3      data_loc_y = data.loc[i,y_name]
    10      5000       3291.0      0.7      0.8      if data_loc_x not in value_dict:
    11        10          6.0      0.6      0.0        value_dict

看着还是loc浪费时间，把他移除循环

In [29]:
def target_mean_v4(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
    data_shape = data.shape[0]
    result = np.zeros(data_shape)
    value_dict = dict()
    count_dict = dict()

    x_val_series = data.loc[:, x_name]
    y_val_series = data.loc[:, y_name]
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        if data_loc_x not in value_dict:
            value_dict[data_loc_x] = data_loc_y
            count_dict[data_loc_x] = 1
        else:
            value_dict[data_loc_x] += data_loc_y
            count_dict[data_loc_x] += 1
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        result[i] = (value_dict[data_loc_x] - data_loc_y) / (count_dict[data_loc_x] - 1)

    return result

In [30]:
%%timeit
target_mean_v4(data,'y','x')

10 loops, best of 3: 73.2 ms per loop


又有了两倍左右的提升，我们接下来继续分析它的代码

In [31]:
profile = LineProfiler(target_mean_v4)
profile.enable()
target_mean_v4(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.233046 s
File: <ipython-input-29-4dd6ebabfe45>
Function: target_mean_v4 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v4(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
     2         1         22.0     22.0      0.0      data_shape = data.shape[0]
     3         1         33.0     33.0      0.0      result = np.zeros(data_shape)
     4         1          2.0      2.0      0.0      value_dict = dict()
     5         1          1.0      1.0      0.0      count_dict = dict()
     6                                           
     7         1        171.0    171.0      0.1      x_val_series = data.loc[:, x_name]
     8         1         68.0     68.0      0.0      y_val_series = data.loc[:, y_name]
     9      5001       2535.0      0.5      1.1      for i in range(data_shape):
    10      5000      56028.0     11.2     24.0          data_loc_x = x_val_s

最占时间的是pandas,我们接下来把pandas转换成ndarray

In [32]:
def target_mean_v5(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
    data_shape = data.shape[0]
    result = np.zeros(data_shape)
    value_dict = dict()
    count_dict = dict()

    x_val_series = data.loc[:, x_name].values
    y_val_series = data.loc[:, y_name].values
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        if data_loc_x not in value_dict:
            value_dict[data_loc_x] = data_loc_y
            count_dict[data_loc_x] = 1
        else:
            value_dict[data_loc_x] += data_loc_y
            count_dict[data_loc_x] += 1
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        result[i] = (value_dict[data_loc_x] - data_loc_y) / (count_dict[data_loc_x] - 1)

    return result

In [33]:
%%timeit
target_mean_v5(data,'y','x')

100 loops, best of 3: 8.3 ms per loop


wow,有10倍左右的提升，真不戳，接下来看看继续分析

In [34]:
profile = LineProfiler(target_mean_v5)
profile.enable()
target_mean_v5(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.036717 s
File: <ipython-input-32-5f082e605143>
Function: target_mean_v5 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v5(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
     2         1         64.0     64.0      0.2      data_shape = data.shape[0]
     3         1        106.0    106.0      0.3      result = np.zeros(data_shape)
     4         1          2.0      2.0      0.0      value_dict = dict()
     5         1          1.0      1.0      0.0      count_dict = dict()
     6                                           
     7         1        300.0    300.0      0.8      x_val_series = data.loc[:, x_name].values
     8         1         75.0     75.0      0.2      y_val_series = data.loc[:, y_name].values
     9      5001       2633.0      0.5      7.2      for i in range(data_shape):
    10      5000       3738.0      0.7     10.2          data_l

这里去掉loc试试

In [35]:
def target_mean_v6(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
    data_shape = data.shape[0]
    result = np.zeros(data_shape)
    value_dict = dict()
    count_dict = dict()

    x_val_series = data[x_name].values
    y_val_series = data[y_name].values
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        if data_loc_x not in value_dict:
            value_dict[data_loc_x] = data_loc_y
            count_dict[data_loc_x] = 1
        else:
            value_dict[data_loc_x] += data_loc_y
            count_dict[data_loc_x] += 1
    for i in range(data_shape):
        data_loc_x = x_val_series[i]
        data_loc_y = y_val_series[i]
        result[i] = (value_dict[data_loc_x] - data_loc_y) / (count_dict[data_loc_x] - 1)

    return result

In [36]:
%%timeit
target_mean_v6(data,'y','x')

100 loops, best of 3: 8.29 ms per loop


并没有多大的提升，继续分析

In [37]:
profile = LineProfiler(target_mean_v6)
profile.enable()
target_mean_v6(data,'y','x')
profile.disable()
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.037406 s
File: <ipython-input-35-7925976ae960>
Function: target_mean_v6 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v6(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
     2         1         27.0     27.0      0.1      data_shape = data.shape[0]
     3         1         39.0     39.0      0.1      result = np.zeros(data_shape)
     4         1          1.0      1.0      0.0      value_dict = dict()
     5         1          2.0      2.0      0.0      count_dict = dict()
     6                                           
     7         1         57.0     57.0      0.2      x_val_series = data[x_name].values
     8         1         17.0     17.0      0.0      y_val_series = data[y_name].values
     9      5001       2914.0      0.6      7.8      for i in range(data_shape):
    10      5000       4067.0      0.8     10.9          data_loc_x = x_val_s

在python中优化的差不多了，然后用cython做优化

In [40]:
%load_ext Cython

接下来的是测试python中tensor的类型

In [38]:
a = np.zeros((5,1))
a.dtype
b = data['y'].values
b.dtype 

dtype('int64')

遵循的原则是：
1.所有类型都有type
2.注意numpy的排序（行/列优先）
3.尽量使用C++自带的数据结构
4.所有内存分配都应该在python中完成，临时变量用C++类进行构建
5.通用方法:numpy—>C 

In [43]:
%%cython 
import numpy as np
cimport numpy as cnp
import pandas as pd
def target_mean_v7(data:pd.DataFrame, y_name:str, x_name:str) ->np.ndarray:
  cdef:
    int data_shape = data.shape[0]
    cnp.ndarray[cnp.float64_t] result= np.zeros(data_shape)
    dict value_dict = {}
    dict count_dict = {}
    cnp.ndarray[cnp.int64_t] x_val_series = data[x_name].values
    cnp.ndarray[cnp.int64_t] y_val_series = data[y_name].values
  for i in range(data_shape):
    data_loc_x = x_val_series[i]
    data_loc_y = y_val_series[i]
    if data_loc_x not in value_dict:
      value_dict[data_loc_x] = data_loc_y
      count_dict[data_loc_x] = 1
    else:
      value_dict[data_loc_x] += data_loc_y
      count_dict[data_loc_x] += 1
  for i in range(data_shape):
    data_loc_x = x_val_series[i]
    data_loc_y = y_val_series[i]
    result[i] = (value_dict[data_loc_x]-data_loc_y) / (count_dict[data_loc_x]-1)
  return result

    

In [44]:
%%timeit
target_mean_v7(data,'y','x')

1000 loops, best of 3: 1.05 ms per loop


有了8倍的飞跃，接下来我们用openmp 并行继续优化

In [63]:
%%cython 
import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython

from cython.parallel import prange
cpdef target_mean_v8(data,cnp.str y_name, cnp.str x_name):
  cdef:
    int data_shape = data.shape[0]
    double[:,] result = np.zeros(data_shape,dtype=np.float64)
    double[:,] value_dict = np.zeros(10,dtype=np.float64)
    double[:,] count_dict = np.zeros(10,dtype=np.float64)
    long[:,] x_val_array = data[x_name].values
    long[:,] y_val_array = data[y_name].values
    int i = 0
  for i in prange(data_shape, nogil=True):
    value_dict[x_val_array[i]] += y_val_array[i]
    count_dict[x_val_array[i]] += 1
  for i in prange(data_shape, nogil=True):
    result[i] = (value_dict[x_val_array[i]] - y_val_array[i])/(count_dict[x_val_array[i]] -1)

  return result



In [64]:
%%timeit
target_mean_v8(data,'y','x')

The slowest run took 7.24 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 61.2 µs per loop
