# ndarray

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## Create ndarray

### np.array()创建数组

In [11]:
# np.array() copies its input by default. The dtype is inferred from the
# data unless one is requested explicitly through the dtype argument.
data1 = [*range(6)]
arr1 = np.array(data1)
print(arr1)
arr1.shape

[0 1 2 3 4 5]


(6,)

In [8]:
# ndmin (not ndim) sets the minimum number of dimensions of the result.
# With ndmin=2 the 1-D list is promoted to a 2-D array of shape (1, 4).
arr3 = np.array([1, 2, 3, 4], ndmin=2)
print(arr3)
arr3.shape

[[1 2 3 4]]


(1, 4)

In [9]:
# A nested list with equal-length rows produces a 2-D array (here 2x2).
arr2 = np.array([[1, 2], [3, 4]])
print(arr2)
arr2.shape

[[1 2]
 [3 4]]


(2, 2)

### zeros,eye,ones创建特殊数组

In [14]:
np.zeros((2, 5))

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [15]:
np.eye(4)

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

In [18]:
# M sets the number of columns: 4 rows x 5 columns, ones on the main diagonal.
np.eye(4, M=5)

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.]])

In [17]:
# k=1 puts the ones on the first diagonal above the main diagonal.
np.eye(4, k=1)

array([[ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.]])

In [22]:
np.ones((4, 4))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

In [29]:
# zeros_like / ones_like build an array of zeros / ones that takes its
# shape and dtype from an existing array (here a 2x2 integer array).
arr = np.arange(4).reshape(2, 2)
arr1 = np.zeros_like(arr)
arr2 = np.ones_like(arr)

In [27]:
arr1

array([[0, 0],
       [0, 0]])

In [28]:
arr2

array([[1, 1],
       [1, 1]])

## Index and Slice

### index and fancy index

In [45]:
# Basic slicing gives a view onto the original array; no data is copied,
# so assigning through the slice modifies the original array in place.
arr = np.arange(10)
arr[5:8] = 0
arr

array([0, 1, 2, 3, 4, 0, 0, 0, 8, 9])

In [46]:
# arr1 is a view of arr[5:8], so writing to arr1[0] also changes arr[5].
arr1 = arr[5:8]
arr1[0] = 100
arr

array([  0,   1,   2,   3,   4, 100,   0,   0,   8,   9])

In [47]:
# Call .copy() to get an independent array; changing the copy does not
# propagate back to arr.
arr2 = arr[:3].copy()
arr2[0] = 1000
arr2

array([1000,    1,    2])

In [48]:
arr

array([  0,   1,   2,   3,   4, 100,   0,   0,   8,   9])

In [49]:
# The 2-D case is straightforward, so go straight to 3-D; indexing still
# returns a view.
# Note that the printed repr separates the blocks along the first axis.
arr3d = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]])
arr3d

array([[[ 0,  1,  2],
        [ 3,  4,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]]])

In [50]:
arr3d[0]

array([[0, 1, 2],
       [3, 4, 5]])

In [51]:
arr3d[0, 1]

array([3, 4, 5])

In [52]:
arr3d[0, 1] = 100

In [53]:
arr3d

array([[[  0,   1,   2],
        [100, 100, 100]],

       [[  6,   7,   8],
        [  9,  10,  11]]])

In [59]:
# Build an 8x4 float array whose i-th row is filled with the value i.
# Broadcasting a column vector across the columns replaces a row-by-row
# Python loop.
arr = np.zeros((8, 4))
arr[:] = np.arange(8).reshape(-1, 1)
arr

array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 2.,  2.,  2.,  2.],
       [ 3.,  3.,  3.,  3.],
       [ 4.,  4.,  4.,  4.],
       [ 5.,  5.,  5.,  5.],
       [ 6.,  6.,  6.,  6.],
       [ 7.,  7.,  7.,  7.]])

In [60]:
# Fancy indexing: a list of row indices selects those rows in the given order.
# Unlike basic slicing, the result is a copy rather than a view.
arr[[2, 6, 3, 1]]

array([[ 2.,  2.,  2.,  2.],
       [ 6.,  6.,  6.,  6.],
       [ 3.,  3.,  3.,  3.],
       [ 1.,  1.,  1.,  1.]])

In [61]:
# Assigning through a fancy index writes into the original array (rows 2 and 6).
arr[[2, 6]] = 100

In [62]:
arr

array([[   0.,    0.,    0.,    0.],
       [   1.,    1.,    1.,    1.],
       [ 100.,  100.,  100.,  100.],
       [   3.,    3.,    3.,    3.],
       [   4.,    4.,    4.,    4.],
       [   5.,    5.,    5.,    5.],
       [ 100.,  100.,  100.,  100.],
       [   7.,    7.,    7.,    7.]])

In [63]:
# Two index lists are paired element-wise: this picks arr[0, 0] and arr[2, 2]
# and returns a 1-D array, not a 2x2 block.
arr[[0, 2], [0, 2]]

array([   0.,  100.])

In [64]:
# To get a rectangular block instead, select rows 0 and 2 first, then take
# columns 1 and 0 (in that order) of the result.
arr[[0, 2]][:, [1, 0]]

array([[   0.,    0.],
       [ 100.,  100.]])

### slice

In [55]:
arr2d = np.arange(6).reshape(2, 3)
arr2d

array([[0, 1, 2],
       [3, 4, 5]])

In [56]:
arr2d[:, :1]

array([[0],
       [3]])

In [57]:
# A slice is still a view, so assigning to it updates arr2d itself
# (here the whole first column).
arr2d[:, :1] = 100
arr2d

array([[100,   1,   2],
       [100,   4,   5]])

## 数组函数

### 常用的数组函数

In [68]:
arr2d

array([[100,   1,   2],
       [100,   4,   5]])

In [65]:
# Maximum over the whole array.
np.max(arr2d)

100

In [66]:
# Maximum of each column (reduces along axis 0).
np.max(arr2d, axis=0)

array([100,   4,   5])

In [67]:
# Maximum of each row (reduces along axis 1).
np.max(arr2d, axis=1)

array([100, 100])

In [69]:
np.arange(1, 10, 2)

array([1, 3, 5, 7, 9])

In [72]:
np.linspace(1, 10, 4)

array([  1.,   4.,   7.,  10.])

In [73]:
# np.where() is the vectorised NumPy counterpart of the `x if cond else y`
# conditional expression.
arr = np.random.randn(4, 4)
# Build a new array in which entries of arr greater than 0 are replaced by 10
# and all other entries are taken from arr; arr itself is not modified.
np.where(arr > 0, 10, arr)

array([[ 10.        ,  -0.95408857,  10.        ,  10.        ],
       [ 10.        ,  10.        ,  10.        ,  -1.23678897],
       [ 10.        ,  -0.45400041,  10.        ,  -0.33845066],
       [ -0.14280762,  10.        ,  10.        ,  10.        ]])

In [74]:
arr

array([[ 0.45733926, -0.95408857,  0.43535392,  0.75568231],
       [ 0.36501333,  0.67859946,  0.83235844, -1.23678897],
       [ 1.06942869, -0.45400041,  0.09575849, -0.33845066],
       [-0.14280762,  0.21140051,  0.13608582,  0.48483319]])

### 集合函数

In [138]:
# Set routines for 1-D arrays:
#   np.unique(x)          sorted unique elements of x
#   np.intersect1d(x, y)  sorted elements common to x and y
#   np.union1d(x, y)      sorted union of x and y
#   np.isin(x, y)         boolean mask: is each element of x present in y?
#                         (np.in1d is the older spelling, deprecated since NumPy 2.0)
#   np.setdiff1d(x, y)    set difference: elements of x that are not in y
#   np.setxor1d(x, y)     symmetric difference: elements in exactly one of x and y
names = np.array(['bob', 'sam', 'tim', 'bom', 'bob', 'lisa'])
np.unique(names)

array(['bob', 'bom', 'lisa', 'sam', 'tim'],
      dtype='<U4')

In [139]:
np.isin(['bob', 'tin'], names)

array([ True, False], dtype=bool)

In [140]:
# np.isin is the supported replacement for np.in1d (deprecated since NumPy 2.0);
# for 1-D input both return the same boolean mask.
np.isin(['bob', 'tim', 'sara'], names)

array([ True,  True, False], dtype=bool)

### 统计函数

In [76]:
arr = np.random.randn(5, 4)
arr

array([[-0.73553474,  0.32961233, -1.58095483,  0.62101528],
       [-0.91879857, -0.9607466 ,  0.08251044,  0.96356426],
       [ 2.6986766 ,  0.43501857,  0.6554008 , -1.11059838],
       [ 0.24974826,  1.1856663 , -0.5688816 ,  1.1572915 ],
       [-0.13660801,  0.76602241, -0.40102008, -0.21070226]])

In [79]:
# Without an axis argument, mean() averages over the whole array; max, min,
# std, var, sum, etc. accept axis in the same way.
# Only the last expression of a cell is displayed, so the output shown is
# the per-row mean (axis=1).
arr.mean()
arr.mean(axis=0)
arr.mean(axis=1)

array([-0.34146549, -0.20836762,  0.6696244 ,  0.50595611,  0.00442302])

In [84]:
# order='F' fills the reshaped array column by column (Fortran order);
# the default order='C' fills it row by row.
arr = np.arange(6).reshape(2, 3, order='F')
arr

array([[0, 2, 4],
       [1, 3, 5]])

In [87]:
# Without an axis argument, cumsum() flattens the array in row-major order
# and returns the 1-D running total.
# Only the last expression is displayed: the running total along each row
# (axis=1).
arr.cumsum()
arr.cumsum(axis=0)
arr.cumsum(axis=1)

array([[0, 2, 6],
       [1, 4, 9]])

In [90]:
np.argmax(arr)

5

In [91]:
np.argmax(arr, axis=0)

array([1, 1, 1])

In [117]:
arr = np.random.randn(4, 5)
arr

array([[-0.33159417, -1.41975614,  1.75923591,  1.27098487, -0.90032524],
       [ 0.74156333,  0.20027451,  1.64263718, -0.5055284 ,  0.41433201],
       [-1.40892905,  0.304323  ,  0.31909388, -0.59293636,  1.10794735],
       [-0.34156247, -1.11111177, -0.49413504, -0.7935205 ,  0.29320952]])

In [118]:
# The default is axis=-1, i.e. sort along the last axis (axis=1 here).
# Note: the sort() method sorts in place, so arr itself is changed.
arr.sort()
arr

array([[-1.41975614, -0.90032524, -0.33159417,  1.27098487,  1.75923591],
       [-0.5055284 ,  0.20027451,  0.41433201,  0.74156333,  1.64263718],
       [-1.40892905, -0.59293636,  0.304323  ,  0.31909388,  1.10794735],
       [-1.11111177, -0.7935205 , -0.49413504, -0.34156247,  0.29320952]])

In [127]:
arr = np.random.randn(4, 5)
arr

array([[-2.00771529,  0.45928217, -1.57692156,  0.77804265, -0.63103615],
       [ 0.61206763, -0.12628017, -0.3659494 , -0.72596732,  0.18497393],
       [-1.36400018, -0.1032458 , -2.22331759,  0.08509441,  0.05242501],
       [ 2.09128698,  0.58948889,  1.29780314,  0.47201643,  0.00903606]])

In [128]:
# np.sort() returns a sorted copy and leaves the array passed in untouched.
np.sort(arr)
arr

array([[-2.00771529,  0.45928217, -1.57692156,  0.77804265, -0.63103615],
       [ 0.61206763, -0.12628017, -0.3659494 , -0.72596732,  0.18497393],
       [-1.36400018, -0.1032458 , -2.22331759,  0.08509441,  0.05242501],
       [ 2.09128698,  0.58948889,  1.29780314,  0.47201643,  0.00903606]])

In [129]:
# Descending sort: negate, sort ascending, then negate the result back.
arr1 = -np.sort(-arr)
arr1

array([[ 0.77804265,  0.45928217, -0.63103615, -1.57692156, -2.00771529],
       [ 0.61206763,  0.18497393, -0.12628017, -0.3659494 , -0.72596732],
       [ 0.08509441,  0.05242501, -0.1032458 , -1.36400018, -2.22331759],
       [ 2.09128698,  1.29780314,  0.58948889,  0.47201643,  0.00903606]])

In [134]:
arr = np.random.randint(1, 10, 4)
arr

array([2, 1, 7, 7])

In [135]:
arr1 = arr.argsort()
arr1

array([1, 0, 2, 3])

In [136]:
# Indexing with the argsort result gives the same values as sorting arr directly.
arr[arr1]

array([1, 2, 7, 7])

In [130]:
arr = np.random.randn(3, 4)
arr

array([[-0.89364406, -1.52916862, -0.53717987,  1.10982533],
       [-0.68980758,  0.71432994,  2.11066208, -1.11631244],
       [ 0.94580569,  0.42114013, -0.50864142, -0.31265415]])

In [131]:
arr.argsort(axis=0)

array([[0, 0, 0, 1],
       [1, 2, 2, 2],
       [2, 1, 1, 0]])

### 用于bool型数组的方法

In [99]:
arr = np.random.randn(5, 4)
(arr > 0).sum()

9

In [100]:
(arr > 2).any()

True

In [101]:
(arr > 2).any(axis=0)

array([False, False,  True, False], dtype=bool)

In [97]:
(arr > 2).all()

False

In [103]:
(arr > 2).all(axis=1)

array([False, False, False, False, False], dtype=bool)

# 保存和读入操作(略)

# 线性代数

In [142]:
# Two independent 2x2 integer matrices, both holding the values 1..4.
x = np.arange(1, 5).reshape(2, 2)
y = x.copy()
x

array([[1, 2],
       [3, 4]])

In [143]:
# Matrix product of x and y (not element-wise multiplication).
x @ y

array([[ 7, 10],
       [15, 22]])

In [148]:
# Commonly used linear-algebra routines:
#   np.diag, np.trace             diagonal / trace (top-level NumPy functions)
#   np.linalg.inv                 matrix inverse
#   np.linalg.det                 determinant
#   np.linalg.eig                 eigenvalues and eigenvectors
#   np.linalg.pinv                generalised (pseudo-) inverse
#   np.linalg.qr, np.linalg.svd   QR and singular value decompositions
arr = np.random.randn(3, 3)
# eig returns the eigenvalues w and a matrix v whose columns are the
# corresponding eigenvectors.
w, v = np.linalg.eig(arr)

In [149]:
# Eigenvalues.
w

array([ 0.00670897+0.93281541j,  0.00670897-0.93281541j,  1.78278955+0.j        ])

In [150]:
# The columns of v are the eigenvectors (column i pairs with w[i]).
v

array([[ 0.89536213+0.j        ,  0.89536213-0.j        ,  0.37230421+0.j        ],
       [ 0.02374356+0.19201687j,  0.02374356-0.19201687j, -0.73625122+0.j        ],
       [-0.11527942+0.38419145j, -0.11527942-0.38419145j,  0.56508735+0.j        ]])

# 随机数

In [204]:
# Main functions of the numpy.random module:
#   seed         seed the global random number generator
#   permutation  return a random permutation of a sequence, or of a range
#                (works on a copy of the original sequence)
#   shuffle      randomly reorder a sequence in place
#   rand         samples from a uniform distribution
#   randint      random integers within given bounds
#                (lower bound inclusive, upper bound exclusive)
#   randn        samples from the standard normal distribution
#   binomial     samples from a binomial distribution
#   normal       samples from a normal distribution
#   beta         samples from a beta distribution
#   chisquare    samples from a chi-square distribution
#   gamma        samples from a gamma distribution
#
# Fixing the seed makes the random numbers repeatable, which helps with
# reproducing results:
# np.random.seed(10)

# Try permutation: an integer argument gives a random permutation of that range.
np.random.permutation(10)

array([8, 2, 5, 6, 3, 1, 0, 7, 4, 9])

In [205]:
np.random.permutation(['alpha','tim','bon','sara','chisa'])

array(['bon', 'chisa', 'sara', 'tim', 'alpha'],
      dtype='<U5')

In [206]:
arr = np.arange(9).reshape(3,3)
arr

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [207]:
# For a multi-dimensional array, permutation only shuffles along the first
# axis (whole rows here); the contents of each row stay intact.
np.random.permutation(arr)

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [208]:
names

array(['bob', 'sam', 'tim', 'bom', 'bob', 'lisa'],
      dtype='<U4')

In [211]:
# Unlike permutation, shuffle randomly reorders the sequence in place,
# so names itself is changed.
np.random.shuffle(names)
names

array(['lisa', 'bom', 'bob', 'sam', 'bob', 'tim'],
      dtype='<U4')

In [212]:
# NumPy's random number support outperforms the standard library's for bulk
# sampling: this cell times drawing N standard-normal samples one at a time
# in a Python list comprehension, as a baseline for the np.random.normal timing.
from random import normalvariate
N = 1_000_000
%timeit samples=[normalvariate(0,1) for _ in range(N)]

1 loop, best of 3: 810 ms per loop


In [214]:
%timeit samples=np.random.normal(0,1,size=N)

10 loops, best of 3: 40.4 ms per loop
