In [1]:
import numpy as np

In [2]:
# 1、导入numpy作为np，并查看版本
np.__version__

'1.15.4'

In [3]:
# 2、如何创建一维数组？
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
# 3. 如何创建一个布尔数组？
np.full((3, 3), True, dtype=bool)

# Alternate method:
np.ones((3,3), dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [5]:
# 4. 如何从一维数组中提取满足指定条件的元素？
# input
arr = np.array([0,1,2,3,4,5,6,7,8,9])

# Solution
arr[arr % 2 == 1]

array([1, 3, 5, 7, 9])

In [6]:
# 5. 如何用numpy数组中的另一个值替换满足条件的元素项？
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr[arr % 2 == 1] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [7]:
# 6. 如何在不影响原始数组的情况下替换满足条件的元素项？
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

out = np.where(arr % 2 == 1, -1, arr)
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [8]:
# 7. 如何改变数组的形状？
# input
arr = np.arange(10)

arr.reshape(2, -1)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [9]:
# 8. 如何垂直叠加两个数组？
# input 
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

np.concatenate([a, b], axis=1)

# np.hstack([a, b])

# np.c_[a, b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [10]:
# 10. 如何在无硬编码的情况下生成numpy中的自定义序列？
# input 
a = np.array([1,2,3])

np.r_[np.repeat(a,3), np.tile(a,3)]

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

In [11]:
# 11. 如何获取两个numpy数组之间的公共项？
# input
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a, b)

array([2, 4])

In [12]:
# 12. 如何从一个数组中删除存在于另一个数组中的项？
# input
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

np.setdiff1d(a, b)

array([1, 2, 3, 4])

In [13]:
# 13. 如何得到两个数组元素匹配的位置？
# input
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where(a == b)

(array([1, 3, 5, 7]),)

In [14]:
# 14. 如何从numpy数组中提取给定范围内的所有数字？
# input 
a = np.arange(15)

index = np.where((a >= 5) & (a <= 10))
a[index]

index = np.where(np.logical_and(a>=5, a<=10))
a[index]

a[(a >= 5) & (a <= 10)]

array([ 5,  6,  7,  8,  9, 10])

In [15]:
# 15. 如何创建一个python函数来处理scalars并在numpy数组上工作？
# input
def maxx(x, y):
    """
    Get the maximum of two items
    """
    if x >= y:
        return x
    else:
        return y
    
pair_max = np.vectorize(maxx, otypes=[float])
a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

pair_max(a, b)

array([6., 7., 9., 8., 9., 7., 5.])

In [16]:
# 16. 如何交换二维numpy数组中的两列？
# input
arr = np.arange(9).reshape(3,3)

arr[:, [1,0,2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [17]:
# 17. 如何交换二维numpy数组中的两行？
# input
arr = np.arange(9).reshape(3,3)

arr[[1,0,2],:]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [18]:
# 18. 如何反转二维数组的行？
# input
arr = np.arange(9).reshape(3,3)

arr[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [19]:
# 19. 如何反转二维数组的列？
# input
arr = np.arange(9).reshape(3,3)

arr[:, ::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [20]:
# 20. 如何创建包含5到10之间随机浮动的二维数组？
# input
arr = np.arange(9).reshape(3,3)

rand_arr = np.random.randint(low=5, high=10, size=(5,3)) + np.random.random((5,3))
print(rand_arr)

rand_arr = np.random.uniform(5,10, size=(5,3))
print(rand_arr)

[[5.83324485 9.11264899 5.62102214]
 [8.90329542 7.92795401 5.09063659]
 [7.07766163 7.5416475  5.20600689]
 [5.7473502  6.92799137 5.10984294]
 [9.76542382 9.39143882 8.12676812]]
[[8.34973364 8.61858126 5.62465369]
 [9.56098291 8.44790994 6.05900427]
 [8.65649496 6.42933142 6.2398012 ]
 [6.53948937 5.10838042 7.92418224]
 [5.00120129 7.91160146 8.34387512]]


In [21]:
# 21. 如何在numpy数组中只打印小数点后三位？
# input
rand_arr = np.random.random((5,3))

np.set_printoptions(precision=3)
rand_arr[:4]

array([[0.604, 0.135, 0.014],
       [0.188, 0.149, 0.094],
       [0.718, 0.716, 0.387],
       [0.861, 0.927, 0.074]])

In [22]:
# 22. 如何通过e式科学记数法（如1e10）来打印一个numpy数组？
# input
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
rand_arr

#np.set_printoptions(suppress=False)
#rand_arr = np.random.random([3,3])/1e3
#rand_arr

#np.set_printoptions(suppress=True, precision=6)
# rand_arr

array([[5.434e-04, 2.784e-04, 4.245e-04],
       [8.448e-04, 4.719e-06, 1.216e-04],
       [6.707e-04, 8.259e-04, 1.367e-04]])

In [23]:
# 23. 如何限制numpy数组输出中打印的项目数？
# input
a = np.arange(15)

np.set_printoptions(threshold=6)
a = np.arange(15)
a

array([ 0,  1,  2, ..., 12, 13, 14])

In [24]:
# 24. 打印完整的numpy数组a而不截断。
# input 
np.set_printoptions(threshold=6)
a = np.arange(15)

np.set_printoptions(threshold=np.nan)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [25]:
# 25. 如何导入数字和文本的数据集保持文本在numpy数组中完好无损？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')


iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

In [26]:
# 26. 如何从1维元组数组中提取特定列？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)
print(iris_1d.shape)

species = np.array([row[4] for row in iris_1d])
species[:5]

(150,)


  after removing the cwd from sys.path.


array([b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', b'Iris-setosa',
       b'Iris-setosa'], dtype='|S18')

In [28]:
# 27. 如何将1维元组数组转换为2维numpy数组？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None)

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
iris_2d[:4]

iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[:4]

  after removing the cwd from sys.path.


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [31]:
# 28. 如何计算numpy数组的均值，中位数，标准差？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

mu, med, sd = np.mean(sepallength), np.mean(sepallength), np.std(sepallength)
print(mu, med, sd)

5.843333333333334 5.843333333333334 0.8253012917851409


In [32]:
# 29. 如何规范化数组，使数组的值正好介于0和1之间？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

Smax, Smin = sepallength.max(), sepallength.min()
S = (sepallength - Smin)/(Smax - Smin)

S = (sepallength -Smin)/sepallength.ptp()
S

array([0.222, 0.167, 0.111, 0.083, 0.194, 0.306, 0.083, 0.194, 0.028,
       0.167, 0.306, 0.139, 0.139, 0.   , 0.417, 0.389, 0.306, 0.222,
       0.389, 0.222, 0.306, 0.222, 0.083, 0.222, 0.139, 0.194, 0.194,
       0.25 , 0.25 , 0.111, 0.139, 0.306, 0.25 , 0.333, 0.167, 0.194,
       0.333, 0.167, 0.028, 0.222, 0.194, 0.056, 0.028, 0.194, 0.222,
       0.139, 0.222, 0.083, 0.278, 0.194, 0.75 , 0.583, 0.722, 0.333,
       0.611, 0.389, 0.556, 0.167, 0.639, 0.25 , 0.194, 0.444, 0.472,
       0.5  , 0.361, 0.667, 0.361, 0.417, 0.528, 0.361, 0.444, 0.5  ,
       0.556, 0.5  , 0.583, 0.639, 0.694, 0.667, 0.472, 0.389, 0.333,
       0.333, 0.417, 0.472, 0.306, 0.472, 0.667, 0.556, 0.361, 0.333,
       0.333, 0.5  , 0.417, 0.194, 0.361, 0.389, 0.389, 0.528, 0.222,
       0.389, 0.556, 0.417, 0.778, 0.556, 0.611, 0.917, 0.167, 0.833,
       0.667, 0.806, 0.611, 0.583, 0.694, 0.389, 0.417, 0.583, 0.611,
       0.944, 0.944, 0.472, 0.722, 0.361, 0.944, 0.556, 0.667, 0.806,
       0.528, 0.5  ,

In [33]:
# 30. 如何计算Softmax得分？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
sepallength = np.array([float(row[0]) for row in iris])

def softmax(x):
    """
    softmax
    """
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

print(softmax(sepallength))

[0.002 0.002 0.001 0.001 0.002 0.003 0.001 0.002 0.001 0.002 0.003 0.002
 0.002 0.001 0.004 0.004 0.003 0.002 0.004 0.002 0.003 0.002 0.001 0.002
 0.002 0.002 0.002 0.002 0.002 0.001 0.002 0.003 0.002 0.003 0.002 0.002
 0.003 0.002 0.001 0.002 0.002 0.001 0.001 0.002 0.002 0.002 0.002 0.001
 0.003 0.002 0.015 0.008 0.013 0.003 0.009 0.004 0.007 0.002 0.01  0.002
 0.002 0.005 0.005 0.006 0.004 0.011 0.004 0.004 0.007 0.004 0.005 0.006
 0.007 0.006 0.008 0.01  0.012 0.011 0.005 0.004 0.003 0.003 0.004 0.005
 0.003 0.005 0.011 0.007 0.004 0.003 0.003 0.006 0.004 0.002 0.004 0.004
 0.004 0.007 0.002 0.004 0.007 0.004 0.016 0.007 0.009 0.027 0.002 0.02
 0.011 0.018 0.009 0.008 0.012 0.004 0.004 0.008 0.009 0.03  0.03  0.005
 0.013 0.004 0.03  0.007 0.011 0.018 0.007 0.006 0.008 0.018 0.022 0.037
 0.008 0.007 0.006 0.03  0.007 0.008 0.005 0.013 0.011 0.013 0.004 0.012
 0.011 0.011 0.007 0.009 0.007 0.005]


In [34]:
# 31. 如何找到numpy数组的百分位数？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

np.percentile(sepallength, q=[5, 95])

array([4.6  , 7.255])

In [36]:
# 32. 如何在数组中的随机位置插入值？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

i, j = np.where(iris_2d)
np.random.seed(100)
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan

np.random.seed(100)
iris_2d[np.random.choice(150, size=20), np.random.randint(4, size=20)] = np.nan

print(iris_2d[:10])

[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'5.0' b'3.6' b'1.4' b'0.2' b'Iris-setosa']
 [b'5.4' b'3.9' b'1.7' b'0.4' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'5.0' b'3.4' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.4' nan b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]


In [37]:
# 33. 如何在numpy数组中找到缺失值的位置？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float')
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print('Number of missing values: \n', np.isnan(iris_2d[:, 0]).sum())
print('Position of missing values: \n', np.where(np.isnan(iris_2d[:, 0])))

Number of missing values: 
 5
Position of missing values: 
 (array([ 38,  80, 106, 113, 121]),)


In [38]:
# 34. 如何根据两个或多个条件过滤numpy数组？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

condition = (iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)
iris_2d[condition]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

In [40]:
# 35. 如何从numpy数组中删除包含缺失值的行？
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

any_nan_in_row = np.array([~np.any(row) for row in iris_2d])
iris_2d[any_nan_in_row][:5]

iris_2d[np.sum(np.isnan(iris_2d), axis=1) == 0][:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [45]:
# 36. 如何找到numpy数组的两列之间的相关性？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Solution 1
np.corrcoef(iris[:, 0], iris[:, 2])[0, 1]

# Solution 2
from scipy.stats.stats import pearsonr  
corr, p_value = pearsonr(iris[:, 0], iris[:, 2])
print(corr)

0.8717541573048712


In [46]:
# 37. 如何查找给定数组是否具有任何空值？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

np.isnan(iris_2d).any()

False

In [47]:
# 38. 如何在numpy数组中用0替换所有缺失值？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

iris_2d[np.isnan(iris_2d)] = 0
iris_2d[:4]

array([[0. , 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [48]:
# 39. 如何在numpy数组中查找唯一值的计数？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

species = np.array([row.tolist()[4] for row in iris])
np.unique(species, return_counts=False)

array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype='|S15')

In [49]:
# 40. 如何将数字转换为分类（文本）数组？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]
petal_length_cat[:4]

['small', 'small', 'small', 'small']

In [50]:
# 41. 如何从numpy数组的现有列创建新列？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')
volume = (np.pi * petallength * (sepallength ** 2))/3

volume = volume[:, np.newaxis]

out = np.hstack([iris_2d, volume])
out[:4]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

In [51]:
# 42. 如何在numpy中进行概率抽样？
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

species = iris[:, 4]

np.random.seed(100)
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

np.random.seed(100)
probs = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, .750, num=50), np.linspace(.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
species_out = species[index]
print(np.unique(species_out, return_counts=True))

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
      dtype=object), array([77, 37, 36]))


In [52]:
# 43. 如何在按另一个数组分组时获取数组的第二大值？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

petal_len_setosa = iris[iris[:, 4] == b'Iris-setosa', [2]].astype('float')
np.unique(np.sort(petal_len_setosa))[-2]

1.7

In [53]:
# 44. 如何按列对2D数组进行排序
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

print(iris[iris[:, 0].argsort()][:20])

[[b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
 [b'4.4' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'3.0' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.5' b'2.3' b'1.3' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.6' b'1.0' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.2' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.1' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.9' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.1' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.9' b'2.4' b'3.3' b'1.0' b'Iris-versicolor']
 [b'4.9' b'2.5' b'4.5' b'1.7' b'Iris-virginica']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]


In [54]:
# 45. 如何在numpy数组中找到最常见的值？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

vals, counts = np.unique(iris[:, 2],return_counts=True)
print(vals[np.argmax(counts)])

b'1.5'


In [55]:
# 46. 如何找到第一次出现的值大于给定值的位置？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

np.argwhere(iris[:, 3].astype(float) > 1.0)[0]

array([50])

In [58]:
# 47. 如何将大于给定值的所有值替换为给定的截止值？
# Input
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1,50, 20)

np.clip(a, a_min=10, a_max=30)
print(np.where(a < 10, 10, np.where(a > 30, 30, a)))

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]


In [60]:
# 48. 如何从numpy数组中获取最大n值的位置？
# input
np.random.seed(100)
a = np.random.uniform(1,50, 20)

print(a.argsort())
np.argpartition(-a, 5)[:5]

a[a.argsort()][-5:]

np.sort(a)[-5:]
np.partition(a, kth=-5)[-5:]

a[np.argpartition(-a, 5)][:5]

[ 4 13  5  8 17 12 11 14 19  1  2  0  9  6 16 18  7  3 10 15]


array([48.95, 44.67, 42.39, 41.47, 41.  ])

In [61]:
# 49. 如何计算数组中所有可能值的行数？
# input
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))
arr

def counts_of_all_values_rowwise(arr2d):
    num_counts_array = [np.unique(row, return_counts=True) for row in arr2d]
    return ([[int(b[a==i]) if i in a else 0 for i in np.unique(arr2d)] for a, b in num_counts_array])

print(np.arange(1, 11))
counts_of_all_values_rowwise(arr)

arr = np.array([np.array(list('bill clinton')), np.array(list('narendramodi')), np.array(list('jjayalalitha'))])
print(np.unique(arr))
counts_of_all_values_rowwise(arr)

[ 1  2  3  4  5  6  7  8  9 10]
[' ' 'a' 'b' 'c' 'd' 'e' 'h' 'i' 'j' 'l' 'm' 'n' 'o' 'r' 't' 'y']


[[1, 0, 1, 1, 0, 0, 0, 2, 0, 3, 0, 2, 1, 0, 1, 0],
 [0, 2, 0, 0, 2, 1, 0, 1, 0, 0, 1, 2, 1, 2, 0, 0],
 [0, 4, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 1, 1]]

In [62]:
# 50. 如何将数组转换为平面一维数组？
# input
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

array_of_arrays = np.array([arr1, arr2, arr3])
print('array_of_arrays', array_of_arrays)

arr_2d = np.array([a for arr in array_of_arrays for a in arr])

arr_2d = np.concatenate(array_of_arrays)
print(arr_2d)

array_of_arrays [array([0, 1, 2]) array([3, 4, 5, 6]) array([7, 8, 9])]
[0 1 2 3 4 5 6 7 8 9]


In [64]:
# 51. 如何在numpy中为数组生成单热编码？
# input
np.random.seed(101) 
arr = np.random.randint(1,4, size=6)
arr

def one_hot_encodings(arr):
    uniqs = np.unique(arr)
    out = np.zeros((arr.shape[0], uniqs.shape[0]))
    
    for i, k in enumerate(arr):
        out[i, k - 1] = 1
        
    return out

one_hot_encodings(arr)

(arr[:, None] == np.unique(arr)).view(np.int8)

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 0, 0]], dtype=int8)

In [65]:
# 52. 如何创建按分类变量分组的行号？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))
species_small

print([i for val in np.unique(species_small) for i, grp in enumerate(species_small[species_small==val])])

[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5]


In [68]:
# 53. 如何根据给定的分类变量创建组ID？
# input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
species_small = np.sort(np.random.choice(species, size=20))
species_small

output = [np.argwhere(np.unique(species_small) == s).tolist()[0][0] for val in np.unique(species_small) for s in species_small[species_small==val]]

output = []
uniqs = np.unique(species_small)

for val in uniqs:
    for s in species_small[species_small==val]:
        groupid = np.argwhere(uniqs == s).tolist()[0][0]  # groupid
        output.append(groupid)
        
print(output)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2]


In [69]:
# 54. 如何使用numpy对数组中的项进行排名？
# input
np.random.seed(10)
a = np.random.randint(20, size=10)
print('Array: ', a)

print(a.argsort().argsort())
print('Array: ', a)

Array:  [ 9  4 15  0 17 16 17  8  9  0]
[4 2 6 0 8 7 9 3 5 1]
Array:  [ 9  4 15  0 17 16 17  8  9  0]


In [70]:
# 55. 如何使用numpy对多维数组中的项进行排名？
# input
np.random.seed(10)
a = np.random.randint(20, size=[2,5])
print(a)

print(a.ravel().argsort().argsort().reshape(a.shape))

[[ 9  4 15  0 17]
 [16 17  8  9  0]]
[[4 2 6 0 8]
 [7 9 3 5 1]]


In [71]:
# 56. 如何在二维numpy数组的每一行中找到最大值？
# input
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

np.amax(a, axis=1)

np.apply_along_axis(np.max, arr=a, axis=1)

array([9, 8, 6, 3, 9])

In [72]:
# 57. 如何计算二维numpy数组每行的最小值？
# input
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=a, axis=1)

array([0.44, 0.12, 0.5 , 1.  , 0.11])

In [74]:
# 58. 如何在numpy数组中找到重复的记录？
# Input
np.random.seed(100)
a = np.random.randint(0, 5, 10)
print('Array: ', a)

out = np.full(a.shape[0], True)
unique_positions = np.unique(a, return_index=True)[1]

out[unique_positions] = False
print(out)

Array:  [0 0 3 0 2 4 2 2 2 2]
[False  True False  True False False  True  True  True  True]


In [77]:
# 59. 如何找出数字的分组均值？
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

numeric_column = iris[:, 1].astype('float')
grouping_column = iris[:, 4]

[[group_val, numeric_column[grouping_column==group_val].mean()] for group_val in np.unique(grouping_column)]

output = []
for group_val in np.unique(grouping_column):
    output.append([group_val, numeric_column[grouping_column==group_val].mean()])
    
output

[[b'Iris-setosa', 3.418],
 [b'Iris-versicolor', 2.7700000000000005],
 [b'Iris-virginica', 2.974]]

In [79]:
# 60. 如何将PIL图像转换为numpy数组？
# input
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL
URL = 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
response = requests.get(URL)

I = Image.open(BytesIO(response.content))
I = I.resize([150, 150])
arr = np.array(I)

im = PIL.Image.fromarray(np.uint8(arr))
Image.Image.show(im)

In [80]:
# 61. 如何删除numpy数组中所有缺少的值？
# input
np.array([1,2,3,np.nan,5,6,7,np.nan])

a[~np.isnan(a)]

array([0, 0, 3, 0, 2, 4, 2, 2, 2, 2])

In [81]:
# 62. 如何计算两个数组之间的欧氏距离？
# Input
a = np.array([1,2,3,4,5])
b = np.array([4,5,6,7,8])

dist = np.linalg.norm(a - b)
dist

6.708203932499369

In [82]:
# 63. 如何在一维数组中找到所有的局部极大值(或峰值)？
# input
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])

doublediff = np.diff(np.sign(np.diff(a)))
peak_locations = np.where(doublediff == -2)[0] + 1
peak_locations

array([2, 5])

In [84]:
# 64. 如何从二维数组中减去一维数组，其中一维数组的每一项从各自的行中减去？
# input
a_2d = np.array([[3,3,3],[4,4,4],[5,5,5]])
b_1d = np.array([1,1,1])
                
print(a_2d - b_1d[:, None])

[[2 2 2]
 [3 3 3]
 [4 4 4]]


In [86]:
# 65. 如何查找数组中项的第n次重复索引？
# input
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])
n = 5

[i for i, v in enumerate(x) if v == 1][n-1]


np.where(x == 1)[0][n - 1]

8

In [88]:
# 66. 如何将numpy的datetime 64对象转换为datetime的datetime对象？
# input
dt64 = np.datetime64('2018-02-25 22:10:10')

from datetime import datetime
dt64.tolist()

dt64.astype(datetime)

datetime.datetime(2018, 2, 25, 22, 10, 10)

In [89]:
# 67. 如何计算numpy数组的移动平均值？
# input 
np.random.seed(100)
Z = np.random.randint(10, size=10)
print('array: ', Z)

def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] -ret[:-n]
    
    return ret[n - 1:] / n

moving_average(Z, n=3).round(3)

np.convolve(Z, np.ones(3)/3, mode='valid')

array:  [8 8 3 7 7 0 4 2 5 2]


array([6.33, 6.  , 5.67, 4.67, 3.67, 2.  , 3.67, 3.  ])

In [90]:
# 68. 如何在给定起始点、长度和步骤的情况下创建一个numpy数组序列？
# input 
length = 10
start = 5
step = 3

def seq(start, length, step):
    end = start + (step*length)
    return np.arange(start, end, step)

seq(start, length, step)

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32])

In [91]:
# 69. 如何填写不规则系列的numpy日期中的缺失日期？
# Input
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

filled_in = np.array([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))]).reshape(-1)

output = np.hstack([filled_in, dates[-1]])
output

out = []
for date, d in zip(dates, np.diff(dates)):
    out.append(np.arange(date, (date+d)))

filled_in = np.array(out).reshape(-1)

output = np.hstack([filled_in, dates[-1]])
output

['2018-02-01' '2018-02-03' '2018-02-05' '2018-02-07' '2018-02-09'
 '2018-02-11' '2018-02-13' '2018-02-15' '2018-02-17' '2018-02-19'
 '2018-02-21' '2018-02-23']


array(['2018-02-01', '2018-02-02', '2018-02-03', '2018-02-04',
       '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08',
       '2018-02-09', '2018-02-10', '2018-02-11', '2018-02-12',
       '2018-02-13', '2018-02-14', '2018-02-15', '2018-02-16',
       '2018-02-17', '2018-02-18', '2018-02-19', '2018-02-20',
       '2018-02-21', '2018-02-22', '2018-02-23'], dtype='datetime64[D]')

In [92]:
# 70. 如何从给定的一维数组创建步长？
# input
arr = np.arange(15) 
arr

def gen_strides(a, stride_len=5, window_len=5):
    n_strides = ((a.size-window_len)//stride_len) + 1
    # return np.array([a[s:(s+window_len)] for s in np.arange(0, a.size, stride_len)[:n_strides]])
    return np.array([a[s:(s+window_len)] for s in np.arange(0, n_strides*stride_len, stride_len)])

print(gen_strides(np.arange(15), stride_len=2, window_len=4))

[[ 0  1  2  3]
 [ 2  3  4  5]
 [ 4  5  6  7]
 [ 6  7  8  9]
 [ 8  9 10 11]
 [10 11 12 13]]
