In [1]:
import numpy as np
import pandas as pd

# numpy索引相关

## 多行混合索引(当每行取得元素个数相同时)

比如你有一个二维数组 `a` 和一个索引数组 `a_idx`，满足 `len(a) == len(a_idx)` 且 `a_idx.shape[1] == k`，表示每行取 k 个元素；`a_idx` 每一行的值是要从 `a` 的对应行中取出的列索引，并且各行的列索引可以互不相同。这样的索引我称为多行混合索引，它无法直接用 `a[a_idx]` 实现（`a[a_idx]` 会把 `a_idx` 中的值当作行索引）。

In [2]:
# 3x4 demo matrix used to illustrate per-row column selection.
a = np.array(
    [
        [1.2, 1.4, 1.12, 2.3],
        [2.1, 2.12, 1.56, 1.74],
        [3.23, 2.12, 4.23, 2.34],
    ]
)

In [3]:
a

array([[1.2 , 1.4 , 1.12, 2.3 ],
       [2.1 , 2.12, 1.56, 1.74],
       [3.23, 2.12, 4.23, 2.34]])

In [90]:
# Every row must contribute the same number of elements; otherwise the
# per-row index lists cannot be stacked into a rectangular array.
k = 3
# Columns to take from each row of `a`:
# row 0 -> columns 0, 3, 2; row 1 -> columns 1, 2, 3; row 2 -> columns 0, 1, 2.
a_idx = np.array(
    [
        [0, 3, 2],
        [1, 2, 3],
        [0, 1, 2],
    ]
)

In [92]:
# For every row i of `a`, pick the columns listed in a_idx[i].
# np.take_along_axis performs this per-row gather directly; it yields the same
# array as
#   a[np.repeat(np.arange(len(a_idx)), k), a_idx.ravel()].reshape(len(a_idx), k)
# without building the repeated row index or reshaping by hand.
np.take_along_axis(a, a_idx, axis=1)

array([[1.2 , 2.3 , 1.12],
       [2.12, 1.56, 1.74],
       [3.23, 2.12, 4.23]])

# Parallel Processing in Python

In [8]:
import multiprocessing as mp


In [4]:
# Seed NumPy's global generator so the sample data is reproducible.
# (Constructing np.random.RandomState(100) and discarding the object seeds
# nothing: np.random.randint draws from the global generator.)
np.random.seed(100)
# 2,000,000 rows of 5 random integers drawn from [0, 10).
arr = np.random.randint(0, 10, size=[2000000, 5])
# The same values as plain nested Python lists.
data = arr.tolist()

In [5]:
def howmany_within_range(row, minimum, maximum):
    """Count the entries of `row` that fall in the closed interval [minimum, maximum]."""
    return sum(1 for value in row if minimum <= value <= maximum)

In [6]:
# Serial baseline: count the values in [4, 8] for each row, one row at a time.
results = [howmany_within_range(row, minimum=4, maximum=8) for row in data]


In [11]:
mp.cpu_count() // 2

4

In [12]:

# Step 1: Init multiprocessing.Pool() with half of the available cores
# (at least one worker: Pool(0) raises ValueError on a single-core machine).
n_workers = max(1, mp.cpu_count() // 2)

# The `with` block tears the pool down on exit, even if a worker raises or the
# cell is interrupted.
with mp.Pool(n_workers) as pool:
    # Step 2: count the values in [4, 8] for every row.
    # `pool.apply` blocks until its single call has finished, so calling it once
    # per row runs the rows one after another and pays one inter-process round
    # trip per row. `starmap` hands the rows out in chunks to all workers and
    # returns the results in input order.
    results = pool.starmap(howmany_within_range, ((row, 4, 8) for row in data))

# Step 3: nothing to close by hand -- leaving the `with` block released the workers.

Process ForkPoolWorker-2:
Process ForkPoolWorker-4:
Process ForkPoolWorker-1:
Process ForkPoolWorker-3:
Traceback (most recent call last):


# 计算样本间距离并只选出最小的k个距离点

- distance.pdist：计算n维空间X中样本间的两两(成对)距离。 参数：X, metric
- distance.cdist：计算X_A和X_B之间的两两(成对)距离。 参数：XA, XB, metric

- np.partition: 对所给数组按照给定位置进行分割，返回分割后的数组。参数：给定数组a，及位置索引kth

比如指定kth=10，设n为数组排序后位于索引10（索引从0开始，即第11小）的元素，则返回的数组满足这些条件：n位于索引10的位置，它之前的10个元素都不大于n，它之后的元素都不小于n，两部分内部的顺序不作要求；kth可以为负数，如-3，则表示按照数组a中第3大的元素对a进行分割。

其应用场景为：比如我们仅想从一个很大的数组里找到最大的10个值，如果先对全部元素进行排序，再取出最大的10个元素，这样的代价会比较大；考虑到只需要这10个值，而不关心其余元素的顺序，则可以用np.partition

In [2]:
from scipy.spatial import distance

In [3]:
# Synthetic data set: one row per sample, one standard-normal column per feature.
nsamples, nfeatures = 10005, 20
X = np.random.randn(nsamples, nfeatures)

In [4]:
# Number of chunks the sample rows are divided into (used to derive `step`).
njobs = 20

In [5]:
# Rows per chunk: nsamples / njobs rounded up, so njobs chunks cover every row.
full_chunks, leftover = divmod(nsamples, njobs)
step = full_chunks + (1 if leftover else 0)

In [6]:
step

501

In [7]:
X.shape

(10005, 20)

In [18]:
# Half-open row range [st, end) of chunk number i (here: the first chunk).
i = 0
st, end = i * step, (i + 1) * step

In [21]:
# Euclidean distances from each sample of the current chunk to every sample:
# row r of `w` belongs to sample st + r, column c to sample c.
w = distance.cdist(X[st:end], X, metric="euclidean")

In [22]:
w.shape

(501, 10005)

In [23]:
w

array([[0.        , 6.50226702, 5.68503918, ..., 5.73421475, 6.97135613,
        7.77232142],
       [6.50226702, 0.        , 6.13201294, ..., 4.75273047, 5.3941752 ,
        6.91599551],
       [5.68503918, 6.13201294, 0.        , ..., 6.5574655 , 7.70145862,
        7.57157916],
       ...,
       [7.97397333, 5.47986608, 7.40401986, ..., 6.44619024, 5.85732901,
        7.56291792],
       [5.81804948, 5.90674852, 7.48309627, ..., 7.08748461, 7.17639955,
        8.90564595],
       [7.46202109, 6.09174844, 5.36317628, ..., 7.34503315, 7.60248785,
        8.04963534]])

In [94]:
# Number of nearest neighbours kept per sample.
k = 10
# Partition positions 1..k; position 0 is deliberately left out.
kths = tuple(np.arange(k) + 1)

In [95]:
# One row per sample with k slots each; filled chunk by chunk with neighbour indices.
# NOTE(review): np.zeros defaults to float64, so the indices end up stored as floats.
z = np.zeros((nsamples, k))

In [96]:
# Same shape and dtype as `z`; receives the distances that belong to the indices in `z`.
pairs = np.zeros_like(z)

In [97]:
pairs.shape

(10005, 10)

In [98]:
z.shape

(10005, 10)

In [99]:
w.shape

(501, 10005)

In [100]:
# Per-row index partition of the distances: positions 1..k (the entries of `kths`)
# hold the column indices a full sort would place there; the order of the
# positions after k is left unspecified.
w_parted_ix = np.argpartition(w, kths, axis=1)

In [101]:
w_parted_ix

array([[    0,  6838,  2352, ..., 10002, 10003, 10004],
       [    1,  4545,  2952, ..., 10002, 10003, 10004],
       [    2,   621,  3207, ..., 10002, 10003, 10004],
       ...,
       [  498,  8628,  5028, ..., 10002, 10003, 10004],
       [  499,  9416,  8833, ..., 10002, 10003, 10004],
       [  500,  3399,  1255, ..., 10002, 10003, 10004]])

In [102]:
w_parted_ix[:, 1:k+1].shape

(501, 10)

In [103]:
# Store columns 1..k of the partition as the neighbour indices of rows st..end-1.
# Column 0 is dropped -- presumably it is the sample itself at distance 0.
z[st:end, :] = w_parted_ix[:, 1:k+1]

In [104]:
z[0]

array([6838., 2352., 6091., 6865., 1683., 4217., 6686., 2412., 8069.,
       1844.])

In [105]:
# Flat (row, column) coordinates of the selected neighbours inside `w`:
# every row number appears k times, paired with that row's k chosen columns.
ixs_rows = np.repeat(np.arange(w.shape[0]), k)
ixs_cols = tuple(w_parted_ix[:, 1:k + 1].ravel())

In [106]:
# Look up the distance at every (row, column) coordinate and fold the flat
# result back into one row of k distances per sample of the chunk.
pairs[st:end] = w[ixs_rows, ixs_cols].reshape(-1, k)