# Numpy高级应用

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import randn

### 数组复制叠加

In [11]:
# Draw a 2x2 matrix of standard-normal samples; it is the input for the
# repeat/tile demonstrations that follow.
arr = np.random.randn(2, 2)
arr

array([[-0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183]])

In [12]:
# Repeat along axis 0: every row appears twice, consecutively.
arr.repeat(repeats=2, axis=0)

array([[-0.57625449,  0.01942968],
       [-0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183],
       [ 0.54055189, -1.54720183]])

In [14]:
# Per-row repeat counts along axis 0: row 0 twice, row 1 three times.
arr.repeat(repeats=[2, 3], axis=0)

array([[-0.57625449,  0.01942968],
       [-0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183],
       [ 0.54055189, -1.54720183],
       [ 0.54055189, -1.54720183]])

In [15]:
# No axis given: the result is one-dimensional, with each element of the
# array (taken in row order) repeated twice.
arr.repeat(repeats=2)

array([-0.57625449, -0.57625449,  0.01942968,  0.01942968,  0.54055189,
        0.54055189, -1.54720183, -1.54720183])

In [16]:
# Per-column repeat counts along axis 1: column 0 twice, column 1 three times.
arr.repeat(repeats=[2, 3], axis=1)

array([[-0.57625449, -0.57625449,  0.01942968,  0.01942968,  0.01942968],
       [ 0.54055189,  0.54055189, -1.54720183, -1.54720183, -1.54720183]])

In [18]:
# Tile the array as a whole unit: 3 copies stacked down the rows and
# 2 copies laid out across the columns.
np.tile(arr, reps=(3, 2))

array([[-0.57625449,  0.01942968, -0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183,  0.54055189, -1.54720183],
       [-0.57625449,  0.01942968, -0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183,  0.54055189, -1.54720183],
       [-0.57625449,  0.01942968, -0.57625449,  0.01942968],
       [ 0.54055189, -1.54720183,  0.54055189, -1.54720183]])

### take and put

In [19]:
# Fresh 2x4 random matrix plus the column positions used by the take demo.
arr = np.random.randn(2, 4)
inds = [2, 0, 2, 1]
arr

array([[-1.25403406,  0.49059126,  1.00190769,  1.10482713],
       [-0.18896958, -1.24589712,  0.03654999,  1.72485541]])

In [20]:
# Select columns 2, 0, 2, 1 (in that order) along axis 1.
arr.take(indices=inds, axis=1)

array([[ 1.00190769, -1.25403406,  1.00190769,  0.49059126],
       [ 0.03654999, -0.18896958,  0.03654999, -1.24589712]])

In [25]:
# put() has no axis option: the indices address the array in flattened,
# row-by-row order, so index 4 here is the first element of the second row.
# Note that put() modifies arr in place.
inds1 = [2, 1, 0, 4]
arr.put(indices=inds1, values=[12, 15, 13, 17])
arr

array([[ 13.        ,  15.        ,  12.        ,   1.10482713],
       [ 17.        ,  -1.24589712,   0.03654999,   1.72485541]])

In [26]:
# 3x4x5 random array; averaging over axis 1 collapses the middle dimension
# and leaves a (3, 5) array of means.
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(axis=1)
depth_means

array([[-0.03100104, -0.02248211, -0.72654355,  0.19408075, -0.24708476],
       [ 0.11599559,  0.40004972,  0.13404769, -0.02805885, -0.2715806 ],
       [ 0.22356581,  0.48585634,  0.16572306, -0.51409381, -0.07059746]])

In [28]:
# Re-insert a length-1 axis at position 1 so the (3, 5) means broadcast
# against the (3, 4, 5) array; the axis-1 means of the result should then be
# zero up to floating-point rounding.
demeaned = arr - depth_means[:, np.newaxis]
demeaned.mean(axis=1)

array([[  2.77555756e-17,   2.77555756e-17,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   2.77555756e-17,   0.00000000e+00,
         -1.38777878e-17,   1.38777878e-17],
       [  5.55111512e-17,  -6.93889390e-18,   2.77555756e-17,
         -1.38777878e-17,   0.00000000e+00]])

In [33]:
# Broadcasting on assignment: the 4-element vector is viewed as a (4, 1)
# column, and each value is then spread across the 3 columns of its row.
col = np.array([1.28, -0.42, 0.44, 1.6])
arr1 = np.zeros((4, 3))
arr1[...] = col.reshape(-1, 1)
arr1

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

### 结构化和记录式数组

In [38]:
# Structured dtype: each element is a record with a float64 field named 'x'
# and an int32 field named 'y'. For example, the tuple (1.5, 6) is stored as
# x=1.5 and y=6.
dtype = [
    ('x', np.float64),
    ('y', np.int32),
]
sarr = np.array(
    [(1.5, 6), (np.pi, -2), (4.25, 3)],
    dtype=dtype,
)
sarr

array([( 1.5       ,  6), ( 3.14159265, -2), ( 4.25      ,  3)],
      dtype=[('x', '<f8'), ('y', '<i4')])

In [35]:
# Index a single record: the first (x, y) element of the structured array.
sarr[0]

( 1.5, 6)

In [36]:
# Field access on one record: the 'y' value of the first element.
sarr[0]['y']

6

In [37]:
# Field access on the whole array: the 'x' values of every record.
sarr['x']

array([ 1.5       ,  3.14159265,  4.25      ])

### 稳定排序算法

In [16]:
# Stable sort demo: order the entries by their numeric prefix while keeping
# entries that share a prefix in their original relative order.
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])

# Parse the prefix as an integer so keys compare numerically; comparing the
# raw strings would be lexicographic and would place '10' before '2'.
key = [int(item.split(':')[0]) for item in values]
key1 = np.array(key)

# mergesort is a stable algorithm, so ties keep their input order.
indexer = key1.argsort(kind='mergesort')
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='|S8')

In [29]:
# Build the sort keys as integers: int() converts the text before the ':'
# in each entry into a number.
key = [int(item.split(':')[0]) for item in values]
key

[2, 2, 1, 1, 1]

In [19]:
# Bin edges, and 50 whole-number samples drawn uniformly from [0, 10000),
# to be bucketed against those edges.
bins = np.array([0, 100, 1000, 5000, 10000])
data = np.floor(np.random.uniform(low=0, high=10000, size=50))
data

array([ 4318.,  8422.,   522.,  7091.,  8440.,   349.,  8511.,  3601.,
        7287.,  7413.,  6667.,   541.,  7405.,   492.,  1777.,  6777.,
        7390.,  6306.,  1044.,  1632.,  9237.,  6441.,  6979.,   504.,
        6243.,  2074.,  2426.,  1323.,  4724.,  6083.,  8843.,  1144.,
        6001.,   183.,  5985.,  8849.,  7288.,  5012.,  9223.,  5541.,
        9766.,  9659.,  2482.,  4982.,  4348.,   393.,  2466.,  2069.,
         822.,  9271.])

In [20]:
# For each sample, find the position in the sorted bin edges where it would
# be inserted (leftmost position on ties); that position is its bin label.
labels = bins.searchsorted(data, side='left')
labels

array([3, 4, 2, 4, 4, 2, 4, 3, 4, 4, 4, 2, 4, 2, 3, 4, 4, 4, 3, 3, 4, 4, 4,
       2, 4, 3, 3, 3, 3, 4, 4, 3, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2,
       3, 3, 2, 4], dtype=int64)

In [22]:
# Count how many samples fall into each bin label.
(
    pd.Series(data)
    .groupby(labels)
    .size()
)

2     8
3    15
4    27
dtype: int64

### 行存储与列存储

In [31]:
# Two 1000x1000 arrays of ones that differ only in memory layout:
# order='C' stores the data row by row, order='F' column by column.
arr_c = np.ones(shape=(1000, 1000), order='C')
arr_f = np.ones(shape=(1000, 1000), order='F')
arr_c.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [32]:
%timeit arr_c.sum(1)    # in theory this one (C-ordered array) should be the faster of the two

1000 loops, best of 3: 1.32 ms per loop


In [34]:
# Same axis-1 sum on the Fortran-ordered array, for comparison with arr_c.
%timeit arr_f.sum(1)

1000 loops, best of 3: 790 µs per loop


In [35]:
# Copying with order='C' yields a new row-major array; inspect its flags to
# confirm the layout changed.
arr_f.copy(order='C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False