# ndarray对象的内部机理

In [1]:
import numpy as np

In [2]:
np.ones((10, 5)).shape

(10, 5)

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

In [4]:
ints = np.ones(10, dtype=np.uint16)

In [5]:
ints

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=uint16)

In [6]:
floats = np.ones(10, dtype=np.float32)
floats

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [7]:
np.issubdtype(ints.dtype, np.integer)

True

In [8]:
np.issubdtype(floats.dtype, np.floating)

True

In [9]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [10]:
np.issubdtype(ints.dtype, np.number)

True

# ⾼级数组操作

In [12]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [13]:
arr.reshape((4, 2))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [14]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [15]:
arr = np.arange(15)
arr

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

作为参数的形状的其中⼀维可以是－1，它表示该维度的⼤⼩由 数据本身推断⽽来

In [16]:
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [17]:
arr = np.arange(15).reshape((5, 3))
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [18]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [19]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [20]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [24]:
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [23]:
arr.ravel('A')

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [22]:
arr.ravel('F')

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

## 数组的合并和拆分

In [25]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr1

array([[1, 2, 3],
       [4, 5, 6]])

In [26]:
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
arr2

array([[ 7,  8,  9],
       [10, 11, 12]])

In [27]:
np.concatenate([arr1, arr2])

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [28]:
np.concatenate([arr1, arr2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [29]:
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [31]:
np.vstack((arr1,arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [32]:
np.hstack((arr1, arr2))

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [33]:
arr = np.random.randn(5, 2)
arr

array([[-0.12826987, -0.2271997 ],
       [ 0.48707655, -1.10080005],
       [-1.36707951,  0.10806486],
       [ 0.26392506,  0.71557431],
       [ 0.22338446, -0.7981275 ]])

In [34]:
first, second, third = np.split(arr, [1, 3])

In [35]:
first

array([[-0.12826987, -0.2271997 ]])

In [36]:
second

array([[ 0.48707655, -1.10080005],
       [-1.36707951,  0.10806486]])

In [37]:
third

array([[ 0.26392506,  0.71557431],
       [ 0.22338446, -0.7981275 ]])

## 堆叠辅助类：r_和c_

In [38]:
arr = np.arange(6)
arr

array([0, 1, 2, 3, 4, 5])

In [39]:
arr1 = arr.reshape((3, 2))
arr1

array([[0, 1],
       [2, 3],
       [4, 5]])

In [40]:
arr2 = np.random.randn(3, 2)
arr2

array([[ 0.68883224,  1.48628249],
       [-0.85299541,  0.31797119],
       [ 0.30042028, -2.19949925]])

In [41]:
np.r_[arr1, arr2]

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [ 0.68883224,  1.48628249],
       [-0.85299541,  0.31797119],
       [ 0.30042028, -2.19949925]])

In [42]:
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.68883224,  1.48628249,  3.        ],
       [-0.85299541,  0.31797119,  4.        ],
       [ 0.30042028, -2.19949925,  5.        ]])

In [43]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

## 元素的重复操作：tile和repeat

In [44]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [45]:
arr.repeat(3)

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [46]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [49]:
arr = np.random.randn(2, 2)
arr

array([[-2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ]])

In [50]:
arr.repeat(2, axis=0)

array([[-2.22785843, -0.56407295],
       [-2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ],
       [ 2.51396859, -0.051778  ]])

In [51]:
arr.repeat(2)

array([-2.22785843, -2.22785843, -0.56407295, -0.56407295,  2.51396859,
        2.51396859, -0.051778  , -0.051778  ])

In [52]:
arr.repeat(2, axis=1)

array([[-2.22785843, -2.22785843, -0.56407295, -0.56407295],
       [ 2.51396859,  2.51396859, -0.051778  , -0.051778  ]])

In [53]:
arr

array([[-2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ]])

tile的功能是沿指定轴向堆叠数组的副本。

In [54]:
np.tile(arr, 2)

array([[-2.22785843, -0.56407295, -2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ,  2.51396859, -0.051778  ]])

In [55]:
np.tile(arr, (2, 1))

array([[-2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ],
       [-2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ]])

In [56]:
np.tile(arr, (3, 2))

array([[-2.22785843, -0.56407295, -2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ,  2.51396859, -0.051778  ],
       [-2.22785843, -0.56407295, -2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ,  2.51396859, -0.051778  ],
       [-2.22785843, -0.56407295, -2.22785843, -0.56407295],
       [ 2.51396859, -0.051778  ,  2.51396859, -0.051778  ]])

In [57]:
np.tile(arr, (3, ))

array([[-2.22785843, -0.56407295, -2.22785843, -0.56407295, -2.22785843,
        -0.56407295],
       [ 2.51396859, -0.051778  ,  2.51396859, -0.051778  ,  2.51396859,
        -0.051778  ]])

## 花式索引的等价函数：take和put

In [58]:
arr = np.arange(10) * 100
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [59]:
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [60]:
arr.take(inds)

array([700, 100, 200, 600])

In [62]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [63]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [64]:
inds = [2, 0, 2, 1]

In [65]:
arr = np.random.randn(2, 4)
arr

array([[-1.03992164, -1.25467994, -2.06825253,  0.55304585],
       [-0.59131878, -0.21076996,  0.85215665, -0.6156224 ]])

In [66]:
arr.take(inds, axis=1)

array([[-2.06825253, -1.03992164, -2.06825253, -1.25467994],
       [ 0.85215665, -0.59131878,  0.85215665, -0.21076996]])

# ⼴播

⼴播（broadcasting）指的是不同形状的数组之间的算术运算的 执⾏⽅式。它是⼀种⾮常强⼤的功能，但也容易令⼈误解，即使 是经验丰富的⽼⼿也是如此。将标量值跟数组合并时就会发⽣最 简单的⼴播：

In [67]:
arr = np.arange(5)
arr

array([0, 1, 2, 3, 4])

In [68]:
arr * 4

array([ 0,  4,  8, 12, 16])

In [69]:
arr = np.random.randn(4, 3)
arr

array([[-0.98299675,  0.55787029, -1.63958454],
       [-0.09112319, -0.21678041, -0.01069486],
       [-0.62368591,  0.25562402,  0.5559968 ],
       [ 0.96102265,  0.14818499, -0.66714437]])

In [70]:
arr.mean(0)

array([-0.1841958 ,  0.18622472, -0.44035674])

In [71]:
arr.mean(1)

array([-0.688237  , -0.10619949,  0.06264497,  0.14735442])

In [72]:
demeaned = arr - arr.mean(0)
demeaned

array([[-0.79880095,  0.37164556, -1.1992278 ],
       [ 0.09307261, -0.40300513,  0.42966188],
       [-0.43949011,  0.0693993 ,  0.99635354],
       [ 1.14521845, -0.03803973, -0.22678763]])

In [73]:
demeaned.mean(0)

array([ 5.55111512e-17, -6.93889390e-18, -2.77555756e-17])

In [75]:
row_means = arr.mean(1)

In [76]:
row_means.shape

(4,)

In [77]:
row_means.reshape((4, 1))

array([[-0.688237  ],
       [-0.10619949],
       [ 0.06264497],
       [ 0.14735442]])

In [78]:
demeaned = arr - row_means.reshape((4, 1))
demeaned

array([[-2.94759748e-01,  1.24610729e+00, -9.51347538e-01],
       [ 1.50762963e-02, -1.10580924e-01,  9.55046278e-02],
       [-6.86330878e-01,  1.92979050e-01,  4.93351828e-01],
       [ 8.13668227e-01,  8.30568974e-04, -8.14498796e-01]])

In [79]:
demeaned.mean(1)

array([ 0.00000000e+00, -9.25185854e-18,  0.00000000e+00, -3.70074342e-17])

## 沿其它轴向⼴播

In [80]:
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [81]:
arr - arr.mean(1).reshape((4, 1))

array([[-2.94759748e-01,  1.24610729e+00, -9.51347538e-01],
       [ 1.50762963e-02, -1.10580924e-01,  9.55046278e-02],
       [-6.86330878e-01,  1.92979050e-01,  4.93351828e-01],
       [ 8.13668227e-01,  8.30568974e-04, -8.14498796e-01]])

In [83]:
arr = np.zeros((4, 4))
arr

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [85]:
arr_3d = arr[:, np.newaxis, :]
arr_3d

array([[[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]],

       [[0., 0., 0., 0.]]])

In [86]:
arr_3d.shape

(4, 1, 4)

In [87]:
arr_1d = np.random.normal(size=3)
arr_1d

array([ 1.02679056, -1.01227901, -0.6605144 ])

In [88]:
arr_1d[:, np.newaxis]

array([[ 1.02679056],
       [-1.01227901],
       [-0.6605144 ]])

In [89]:
arr_1d[np.newaxis, :]

array([[ 1.02679056, -1.01227901, -0.6605144 ]])

In [91]:
arr = np.random.randn(3, 4, 5)
arr

array([[[-2.11878925, -0.5184341 , -0.00876998, -1.08417714,
         -0.16019441],
        [-0.26316321, -0.90136783, -0.97761982,  1.25504351,
          1.51833908],
        [ 0.75671442, -2.00737837, -0.11680876, -1.29527617,
          0.60372446],
        [ 0.63091492, -0.02169671,  0.17041219,  1.26247189,
          0.67276743]],

       [[ 0.08151435,  1.71495402,  0.79651979, -0.50496686,
          0.65178377],
        [-0.13874637, -1.63921048,  2.67056879, -0.12917871,
          0.24638555],
        [-0.91749974, -0.51498739, -0.73938786,  0.63352403,
         -1.45908366],
        [ 0.03551651, -2.3782175 , -0.87834998,  1.18652285,
          0.01758384]],

       [[ 0.07899745,  0.5318421 ,  0.29843254,  0.40227354,
          0.01331999],
        [-0.08030607,  0.87925082, -0.0318174 ,  0.33499197,
          1.73749682],
        [ 0.51643892,  0.30083109, -1.42275952,  1.52622942,
         -0.12105428],
        [-1.47383722, -0.01826851, -1.75398265, -2.27898209,
          1

In [93]:
depth_means = arr.mean(2)
depth_means

array([[-0.77807298,  0.12624635, -0.41180489,  0.54297395],
       [ 0.54796101,  0.20196375, -0.59948692, -0.40338885],
       [ 0.26497312,  0.56792323,  0.15993713, -0.88438506]])

In [94]:
depth_means.shape

(3, 4)

In [95]:
demeaned = arr - depth_means[:, :, np.newaxis]

In [96]:
demeaned.mean(2)

array([[-2.22044605e-17,  0.00000000e+00,  4.44089210e-17,
         6.66133815e-17],
       [ 2.22044605e-17,  6.10622664e-17,  6.66133815e-17,
         2.22044605e-17],
       [-1.11022302e-17,  0.00000000e+00,  4.44089210e-17,
        -8.88178420e-17]])

In [97]:
def demean_axis(arr, axis=0):
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dim 
    indexer = [slice(None)] * arr.ndim 
    indexer[axis] = np.newaxis 
    return arr - means[indexer]

In [98]:
demean_axis(arr, axis=0)

  


array([[[-1.4660301 , -1.09455477, -0.37083076, -0.68855366,
         -0.32849752],
        [-0.10242466, -0.347592  , -1.53133034,  0.76809125,
          0.35093193],
        [ 0.63816322, -1.26686681,  0.64284329, -1.58343526,
          0.92919562],
        [ 0.90005019,  0.7843642 ,  0.99105233,  1.20580101,
          0.07493528]],

       [[ 0.7342735 ,  1.13883335,  0.43445901, -0.10934337,
          0.48348065],
        [ 0.02199218, -1.08543465,  2.11685826, -0.61613097,
         -0.9210216 ],
        [-1.03605094,  0.22552417,  0.02026418,  0.34536494,
         -1.1336125 ],
        [ 0.30465177, -1.57215659, -0.05770983,  1.12985196,
         -0.58024831]],

       [[ 0.7317566 , -0.04427857, -0.06362824,  0.79789703,
         -0.15498313],
        [ 0.08043248,  1.43302665, -0.58552793, -0.15196029,
          0.57008967],
        [ 0.39788772,  1.04134265, -0.66310747,  1.23807033,
          0.20441688],
        [-1.20470196,  0.78779239, -0.93334251, -2.33565297,
          0

## 通过⼴播设置数组的值

In [103]:
arr = np.zeros((4, 3))

In [104]:
arr[:] = 5

In [105]:
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [106]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]

In [107]:
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [108]:
col[:, np.newaxis]

array([[ 1.28],
       [-0.42],
       [ 0.44],
       [ 1.6 ]])

In [109]:
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

# ufunc⾼级应⽤

## ufunc实例⽅法

In [110]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

⽤np.add.reduce对数组 中各个元素进⾏求和

In [None]:
 np.add.reduce(arr)

In [112]:
arr.sum()

45

⽤ np.logical_and检查数组各⾏中的值是否是有序的

In [113]:
np.random.seed(12346)

In [114]:
arr = np.random.randn(5, 5)

In [117]:
arr[::2].sort(1)

In [118]:
arr

array([[-9.81497953e-01, -8.99822478e-02,  3.65775545e-01,
         7.48336101e-01,  7.59372617e-01],
       [-3.15442628e-01, -8.66135605e-01,  2.78568155e-02,
        -4.55597723e-01, -1.60189223e+00],
       [-8.48730755e-01, -5.46459347e-01, -3.21536673e-01,
         4.60468309e-04,  2.48256116e-01],
       [ 2.53915229e-01,  1.93684246e+00, -7.99504902e-01,
        -5.69159281e-01,  4.89244731e-02],
       [-9.53521432e-01, -6.49092950e-01, -4.79535727e-01,
         1.75403128e-01,  1.42253882e+00]])

In [119]:
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

In [121]:
np.array([1,2,3,4])[:-1]

array([1, 2, 3])

In [125]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis =1)

array([ True, False,  True, False,  True])

In [126]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [127]:
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

outer⽤于计算两个数组的叉积

In [128]:
arr = np.arange(3).repeat([1, 2, 2])
arr

array([0, 1, 1, 2, 2])

In [129]:
np.multiply.outer(arr, np.arange(5))

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [130]:
np.multiply.outer(np.arange(5), arr)

array([[0, 0, 0, 0, 0],
       [0, 1, 1, 2, 2],
       [0, 2, 2, 4, 4],
       [0, 3, 3, 6, 6],
       [0, 4, 4, 8, 8]])

In [131]:
x, y = np.random.randn(3, 4), np.random.randn(5)

In [132]:
x

array([[ 0.52241759,  0.10641604,  0.10271336, -0.10823261],
       [ 0.05485992,  0.1963653 , -0.19387262, -1.45657748],
       [ 0.85744762, -0.74157558, -0.78036253, -0.1064245 ]])

In [133]:
y

array([ 0.59371272, -1.28346227,  0.47796048,  1.29244703,  0.1516492 ])

In [134]:
result = np.subtract.outer(x, y)
result

array([[[-0.07129513,  1.80587986,  0.04445711, -0.77002945,
          0.37076839],
        [-0.48729668,  1.38987831, -0.37154444, -1.18603099,
         -0.04523316],
        [-0.49099936,  1.38617563, -0.37524711, -1.18973367,
         -0.04893584],
        [-0.70194533,  1.17522966, -0.58619309, -1.40067964,
         -0.25988181]],

       [[-0.5388528 ,  1.33832219, -0.42310056, -1.23758711,
         -0.09678928],
        [-0.39734742,  1.47982757, -0.28159518, -1.09608173,
          0.0447161 ],
        [-0.78758534,  1.08958965, -0.6718331 , -1.48631965,
         -0.34552182],
        [-2.05029021, -0.17311522, -1.93453796, -2.74902452,
         -1.60822669]],

       [[ 0.2637349 ,  2.14090989,  0.37948715, -0.43499941,
          0.70579842],
        [-1.3352883 ,  0.54188669, -1.21953606, -2.03402261,
         -0.89322478],
        [-1.37407525,  0.50309974, -1.25832301, -2.07280956,
         -0.93201173],
        [-0.70013722,  1.17703777, -0.58438498, -1.39887153,
         -0

In [135]:
result.shape

(3, 4, 5)

In [136]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

In [138]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [139]:
arr = np.multiply.outer(np.arange(4), np.arange(5))

In [140]:
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

In [141]:
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

## 编写新的ufunc

In [142]:
def add_elements(x, y):
    return x + y

In [148]:
add_them = np.frompyfunc(add_elements, 2, 1)

In [149]:
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [150]:
add_them = np.vectorize(add_elements, otypes=[np.float])

In [151]:
add_them(np.arange(8), np.arange(8))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [152]:
arr = np.random.randn(10000)

In [153]:
%timeit add_them(arr, arr)

2.64 ms ± 721 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [154]:
%timeit np.add(arr, arr)

4.36 µs ± 760 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


# 结构化和记录式数组

In [155]:
dtype = [('x', np.float64), ('y', np.int32)]

In [156]:
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)

In [157]:
sarr

array([(1.5       ,  6), (3.14159265, -2)],
      dtype=[('x', '<f8'), ('y', '<i4')])

In [158]:
sarr[0]

(1.5, 6)

In [159]:
sarr[0]['y']

6

In [160]:
sarr['x']

array([1.5       , 3.14159265])

## 嵌套dtype和多维字段

In [161]:
dtype = [('x', np.int64, 3), ('y', np.int32)]

In [162]:
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [163]:
arr[0]['x']

array([0, 0, 0])

In [164]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [165]:
arr['y']

array([0, 0, 0, 0], dtype=int32)

In [166]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]

In [167]:
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)

In [168]:
data

array([((1., 2.), 5), ((3., 4.), 6)],
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [169]:
data['x']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

In [170]:
data['y']

array([5, 6], dtype=int32)

In [171]:
data['x']['a']

array([1., 3.])

# 更多有关排序的话题

In [172]:
arr = np.random.randn(6)
arr

array([ 1.13969136,  1.28881614,  0.37588273, -1.08199644,  1.84126094,
        0.80139193])

In [173]:
arr.sort()

In [174]:
arr

array([-1.08199644,  0.37588273,  0.80139193,  1.13969136,  1.28881614,
        1.84126094])

In [175]:
arr = np.random.randn(3, 5)
arr

array([[-0.33176812, -1.47108206,  0.87050269, -0.08468875, -1.13286962],
       [-1.01114869, -0.34357617,  2.17140268,  0.12337075, -0.01893118],
       [ 0.17731791,  0.7423957 ,  0.85475634,  1.03797268, -0.32899594]])

In [176]:
arr[:, 0].sort() # Sort first column values in-plac

In [177]:
arr

array([[-1.01114869, -1.47108206,  0.87050269, -0.08468875, -1.13286962],
       [-0.33176812, -0.34357617,  2.17140268,  0.12337075, -0.01893118],
       [ 0.17731791,  0.7423957 ,  0.85475634,  1.03797268, -0.32899594]])

In [178]:
arr = np.random.randn(5)
arr

array([-1.11807759, -0.24152521, -2.0051193 ,  0.73788753, -1.06137462])

In [179]:
np.sort(arr)

array([-2.0051193 , -1.11807759, -1.06137462, -0.24152521,  0.73788753])

In [180]:
arr

array([-1.11807759, -0.24152521, -2.0051193 ,  0.73788753, -1.06137462])

In [181]:
arr = np.random.randn(3, 5)
arr

array([[ 0.59545348, -0.26822958,  1.33885804, -0.18715572,  0.91108374],
       [-0.32150045,  1.00543901, -0.51683937,  1.19251887, -0.19893404],
       [ 0.39691349, -1.76381537,  0.60709023, -0.22215536, -0.21707838]])

In [182]:
arr.sort(axis=1)

In [183]:
arr

array([[-0.26822958, -0.18715572,  0.59545348,  0.91108374,  1.33885804],
       [-0.51683937, -0.32150045, -0.19893404,  1.00543901,  1.19251887],
       [-1.76381537, -0.22215536, -0.21707838,  0.39691349,  0.60709023]])

In [184]:
arr[:, ::-1]

array([[ 1.33885804,  0.91108374,  0.59545348, -0.18715572, -0.26822958],
       [ 1.19251887,  1.00543901, -0.19893404, -0.32150045, -0.51683937],
       [ 0.60709023,  0.39691349, -0.21707838, -0.22215536, -1.76381537]])

In [185]:
[1,2,3][::-1]

[3, 2, 1]

## 间接排序：argsort和lexsort

In [186]:
values = np.array([5, 0, 1, 3, 2])

In [187]:
indexer = values.argsort()
indexer

array([1, 2, 4, 3, 0])

In [188]:
values[indexer]

array([0, 1, 2, 3, 5])

In [189]:
arr = np.random.randn(3, 5)
arr

array([[-1.21357483, -0.87044607, -0.2305542 ,  1.04376344, -1.14410284],
       [-0.36360302, -0.13775933,  2.17773731, -0.47280687,  0.8356152 ],
       [-0.20885016,  0.23159352,  0.72798172, -1.3918432 ,  1.99558262]])

In [190]:
arr[0] = values
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [-0.36360302, -0.13775933,  2.17773731, -0.47280687,  0.8356152 ],
       [-0.20885016,  0.23159352,  0.72798172, -1.3918432 ,  1.99558262]])

In [191]:
arr[:, arr[0].argsort()]

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [-0.13775933,  2.17773731,  0.8356152 , -0.47280687, -0.36360302],
       [ 0.23159352,  0.72798172,  1.99558262, -1.3918432 , -0.20885016]])

In [192]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill'])

In [193]:
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Gachi'])

In [194]:
sorter = np.lexsort((first_name, last_name))

In [195]:
sorter

array([1, 2, 3, 0])

In [196]:
zip(last_name[sorter], first_name[sorter])

<zip at 0x7f4bb2efccc8>

## 其他排序算法

In [197]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])

In [198]:
key = np.array([2, 2, 1, 1, 1])

In [199]:
indexer = key.argsort(kind='mergesort')

In [200]:
indexer

array([2, 3, 4, 0, 1])

In [201]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

## 部分排序数组

In [202]:
np.random.seed(12345)

In [203]:
arr = np.random.randn(20)
arr

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474,
        1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684,
        0.88642934, -2.00163731, -0.37184254,  1.66902531, -0.43856974])

In [204]:
np.partition(arr, 3)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

In [205]:
indices = np.argpartition(arr, 3)

In [206]:
indices

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

In [207]:
arr.take(indices)

array([-2.00163731, -1.29622111, -0.5557303 , -0.51943872, -0.37184254,
       -0.43856974, -0.20470766,  0.28174615,  0.76902257,  0.47894334,
        1.00718936,  0.09290788,  0.27499163,  0.22891288,  1.35291684,
        0.88642934,  1.39340583,  1.96578057,  1.66902531,  1.24643474])

## numpy.searchsorted：在有序数组中查找元素

In [208]:
arr = np.array([0, 1, 7, 12, 15])

In [209]:
arr.searchsorted(9)

3

In [213]:
np.insert(arr, 3, 9)

array([ 0,  1,  7,  9, 12, 15])

In [210]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

In [214]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])

In [215]:
arr.searchsorted([0, 1])

array([0, 3])

In [216]:
arr.searchsorted([0, 1], side='right')

array([3, 7])

In [217]:
data = np.floor(np.random.uniform(0, 10000, size=50))

In [218]:
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

In [219]:
bins = np.array([0, 100, 1000, 5000, 10000])

In [220]:
labels = bins.searchsorted(data)

In [221]:
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

In [224]:
import pandas as pd
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

# ⽤Numba编写快速NumPy函数

In [225]:
import numpy as np

def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count

In [227]:
x = np.random.randn(10000000)

In [228]:
y = np.random.randn(10000000)

In [230]:
%time mean_distance(x, y)

CPU times: user 6.44 s, sys: 4.43 ms, total: 6.44 s
Wall time: 6.64 s


0.00030647566526243147

In [231]:
%timeit (x - y).mean()

118 ms ± 9.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [233]:
import numba as nb

In [234]:
numba_mean_distance = nb.jit(mean_distance)

In [235]:
@nb.jit
def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count

In [236]:
%timeit numba_mean_distance(x, y)

23.7 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [237]:
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    return (x - y).mean()

In [241]:
%timeit numba_mean_distance(x, y)

20.9 ms ± 59.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## ⽤Numba创建⾃定义numpy.ufunc对象

In [242]:
from numba import vectorize

@vectorize
def nb_add(x, y):
    return x + y

In [243]:
x = np.arange(10)
nb_add(x, x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [246]:
nb_add.accumulate(x, 0)

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

# ⾼级数组输⼊输出

## 内存映像⽂件

In [248]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))

In [249]:
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [250]:
section = mmap[:5]

In [251]:
section[:] = np.random.randn(5, 10000)

In [252]:
mmap.flush()

In [253]:
mmap

memmap([[ 0.56603952,  0.43116575, -0.82120347, ..., -0.55973364,
         -0.15669814, -2.70481814],
        [ 0.62161543, -0.1767397 ,  1.24016792, ..., -0.45129708,
         -0.60809984, -0.4414712 ],
        [ 0.11724405,  0.15035116, -0.19596831, ...,  1.2193065 ,
         -0.25665271,  1.23090493],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

In [254]:
del mmap

In [255]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))

In [256]:
mmap

memmap([[ 0.56603952,  0.43116575, -0.82120347, ..., -0.55973364,
         -0.15669814, -2.70481814],
        [ 0.62161543, -0.1767397 ,  1.24016792, ..., -0.45129708,
         -0.60809984, -0.4414712 ],
        [ 0.11724405,  0.15035116, -0.19596831, ...,  1.2193065 ,
         -0.25665271,  1.23090493],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

# 性能建议

## 连续内存的重要性

In [257]:
arr_c = np.ones((1000, 1000), order='C')

In [258]:
arr_f = np.ones((1000, 1000), order='F')

In [259]:
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [260]:
arr_f.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [261]:
arr_f.flags.f_contiguous

True

In [262]:
%timeit arr_c.sum(1)

473 µs ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [263]:
%timeit arr_f.sum(1)

576 µs ± 69.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [264]:
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [265]:
arr_c[:50].flags.contiguous

True

In [266]:
arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [270]:
!cmd

/usr/bin/sh: cmd: command not found
