In [5]:
# https://www.machinelearningplus.com/python/101-numpy-exercises-python/
# 1. Import numpy as np and see the version
import numpy as np

# Bind the version string to a name before printing it.
numpy_version = np.__version__
print(numpy_version)

1.16.5


In [6]:
# 2. How to create a 1D array?
# Explicit start/stop: the integers 0 through 9.
arr = np.arange(0, 10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
# 3. How to create a boolean array?
# A 3x3 array of ones with a boolean dtype is all True.
np.ones((3, 3), dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [9]:
# 4. How to extract items that satisfy a given condition from 1D array?
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Boolean mask that is True at the odd entries.
is_odd = arr % 2 == 1
arr[is_odd]

array([1, 3, 5, 7, 9])

In [15]:
# 5. How to replace items that satisfy a condition with another value in numpy array?
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Overwrite the odd entries in place with -1.
odd_mask = arr % 2 == 1
np.putmask(arr, odd_mask, -1)
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [16]:
# 6. How to replace items that satisfy a condition without affecting the original array?
arr = np.arange(10)
# Work on a copy so `arr` keeps its original values.
out = arr.copy()
out[arr % 2 == 1] = -1
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

In [19]:
# 7. How to reshape an array?
arr = np.arange(10)
# Two rows; -1 lets numpy work out the number of columns.
np.reshape(arr, (2, -1))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [22]:
# 8. How to stack two arrays vertically?
a = np.arange(10).reshape(2, -1)
b = np.ones((2, 5), dtype=int)
print(a)
print(b)
# Joining along axis 0 places b's rows underneath a's rows.
np.concatenate([a, b], axis=0)

[[0 1 2 3 4]
 [5 6 7 8 9]]
[[1 1 1 1 1]
 [1 1 1 1 1]]


array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [25]:
# 9. How to stack two arrays horizontally?
a = np.arange(10).reshape(2, -1)
b = np.ones((2, 5), dtype=int)
print(a)
print(b)
# Joining along axis 1 places b's columns to the right of a's columns.
np.concatenate([a, b], axis=1)

[[0 1 2 3 4]
 [5 6 7 8 9]]
[[1 1 1 1 1]
 [1 1 1 1 1]]


array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

In [31]:
# 10. How to generate custom sequences in numpy without hardcoding?
a = np.array([1, 2, 3])
print(a)
# repeat -> 1,1,1,2,2,2,3,3,3 ; tile -> 1,2,3,1,2,3,1,2,3
each_repeated = np.repeat(a, 3)
whole_tiled = np.tile(a, 3)
np.concatenate([each_repeated, whole_tiled])

[1 2 3]


array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

In [32]:
# 11. How to get the common items between two python numpy arrays?
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
# Sorted, de-duplicated values present in both arrays.
common_items = np.intersect1d(a, b)
common_items

array([2, 4])

In [34]:
# 12. How to remove from one array those items that exist in another?
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 6, 7, 8, 9])
# Values of `a` that do not occur anywhere in `b`.
only_in_a = np.setdiff1d(a, b)
only_in_a

array([1, 2, 3, 4])

In [35]:
# 13. How to get the positions where elements of two arrays match?
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
# Elementwise comparison, then the indices where it holds.
same_value = a == b
np.nonzero(same_value)

(array([1, 3, 5, 7], dtype=int64),)

In [36]:
# 14. How to extract all numbers between a given range from a numpy array?
a = np.arange(15)
# Keep values in the closed interval [5, 10].
in_range = np.logical_and(a >= 5, a <= 10)
a[in_range]

array([ 5,  6,  7,  8,  9, 10])

In [37]:
# 15. How to make a python function that handles scalars to work on numpy arrays?
def maxx(x, y):
    """Return the larger of two scalars; `x` is returned when they compare equal."""
    return x if x >= y else y

# np.vectorize applies the scalar function elementwise; otypes forces a float result.
pair_max = np.vectorize(maxx, otypes=[float])

a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

pair_max(a, b)

array([6., 7., 9., 8., 9., 7., 5.])

In [38]:
# 16. How to swap two columns in a 2d numpy array?
arr = np.arange(9).reshape(3, 3)
print(arr)
# New column order: columns 0 and 1 trade places, column 2 stays.
column_order = [1, 0, 2]
arr[:, column_order]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

In [39]:
# 17. How to swap two rows in a 2d numpy array?
arr = np.arange(9).reshape(3, 3)
print(arr)
# New row order: rows 0 and 1 trade places, row 2 stays.
row_order = [1, 0, 2]
arr[row_order, :]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

In [41]:
# 18. How to reverse the rows of a 2D array?
arr = np.arange(9).reshape(3, 3)
print(arr)
# flipud reverses the order of the rows (axis 0).
np.flipud(arr)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

In [42]:
# 19. How to reverse the columns of a 2D array?
arr = np.arange(9).reshape(3, 3)
# fliplr reverses the order of the columns (axis 1).
np.fliplr(arr)

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

In [44]:
# 20. How to create a 2D array containing random floats between 5 and 10?
arr = np.arange(9).reshape(3, 3)
print(arr)
# Uniform draws from [5, 10) arranged as 5 rows by 3 columns (no seed is set here).
rand_arr = np.random.uniform(low=5, high=10, size=(5, 3))
print(rand_arr)

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[6.51642676 5.63084075 7.20187521]
 [8.27878638 6.0110088  6.7162379 ]
 [7.36262295 6.3845837  5.96287598]
 [8.09288988 9.18912858 8.64176632]
 [8.0658007  8.54851532 9.93043444]]


In [45]:
# 21. How to print only 3 decimal places in python numpy array?
# This changes the global print options, so it also affects later cells.
np.set_printoptions(precision=3)
rand_arr = np.random.random(size=(5, 3))
rand_arr[:4]

array([[0.475, 0.904, 0.622],
       [0.405, 0.156, 0.804],
       [0.617, 0.123, 0.422],
       [0.249, 0.773, 0.565]])

In [47]:
# 22. How to pretty print a numpy array by suppressing the scientific notation (like 1e10)?
np.random.seed(100)
rand_arr = np.random.random([3, 3]) / 1e3

# suppress=True prints small floats in fixed-point notation instead of
# scientific notation; precision=6 keeps six decimals. Both are global
# print options and stay in force for later cells.
np.set_printoptions(precision=6, suppress=True)
rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

In [48]:
# 23. How to limit the number of items printed in output of numpy array?
a = np.arange(15)
# Arrays with more than `threshold` items are summarised with "..." when printed.
np.set_printoptions(threshold=6)
a

array([ 0,  1,  2, ..., 12, 13, 14])

In [51]:
# 24. How to print the full numpy array without truncating
import sys

a = np.arange(15)
# The largest possible threshold means no array is ever summarised with "...".
np.set_printoptions(threshold=sys.maxsize)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [53]:
# 25. How to import a dataset with numbers and texts keeping the text intact in python numpy?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# dtype='object' keeps every field as the raw value that was read, text included.
iris = np.genfromtxt(url, dtype='object', delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
# Show only the first three records.
iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

In [61]:
# 26. How to extract a particular column from 1D array of tuples?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# dtype=None lets genfromtxt pick a type per column; the result is 1-D, one record per line.
iris_1d = np.genfromtxt(url, dtype=None, delimiter=',', encoding=None)
print(iris_1d.shape)
# Field 4 of every record is the species label.
species = np.array([record[4] for record in iris_1d])
species[:5]

(150,)


array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

In [62]:
# 27. How to convert a 1d array of tuples to a 2d numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, dtype=None, delimiter=',', encoding=None)
# Keep the first four fields of every record and stack them into a 2-D array.
numeric_rows = [record.tolist()[:4] for record in iris_1d]
iris_2d = np.array(numeric_rows)
iris_2d[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [63]:
# 28. How to compute the mean, median, standard deviation of a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
# Only column 0 is read, as floats.
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
mu = sepallength.mean()
med = np.median(sepallength)
sd = sepallength.std()
print(mu, med, sd)

5.843333333333334 5.8 0.8253012917851409


In [65]:
# 29. How to normalize an array so the values range exactly between 0 and 1?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
Smax = sepallength.max()
Smin = sepallength.min()
# Min-max scaling: the smallest value maps to 0 and the largest to 1.
value_range = Smax - Smin
S = (sepallength - Smin) / value_range
print(S)

[0.222222 0.166667 0.111111 0.083333 0.194444 0.305556 0.083333 0.194444
 0.027778 0.166667 0.305556 0.138889 0.138889 0.       0.416667 0.388889
 0.305556 0.222222 0.388889 0.222222 0.305556 0.222222 0.083333 0.222222
 0.138889 0.194444 0.194444 0.25     0.25     0.111111 0.138889 0.305556
 0.25     0.333333 0.166667 0.194444 0.333333 0.166667 0.027778 0.222222
 0.194444 0.055556 0.027778 0.194444 0.222222 0.138889 0.222222 0.083333
 0.277778 0.194444 0.75     0.583333 0.722222 0.333333 0.611111 0.388889
 0.555556 0.166667 0.638889 0.25     0.194444 0.444444 0.472222 0.5
 0.361111 0.666667 0.361111 0.416667 0.527778 0.361111 0.444444 0.5
 0.555556 0.5      0.583333 0.638889 0.694444 0.666667 0.472222 0.388889
 0.333333 0.333333 0.416667 0.472222 0.305556 0.472222 0.666667 0.555556
 0.361111 0.333333 0.333333 0.5      0.416667 0.194444 0.361111 0.388889
 0.388889 0.527778 0.222222 0.388889 0.555556 0.416667 0.777778 0.555556
 0.611111 0.916667 0.166667 0.833333 0.666667 0.805556 0.6111

In [66]:
# 30. How to compute the softmax score?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
# Column 0 is read as raw objects; convert each entry to float.
sepallength = iris[:, 0].astype(float)

def softmax(x, axis=0):
    """Compute softmax scores along one axis of `x`.

    https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python

    Parameters
    ----------
    x : array_like
        Scores. For a 1-D input the result sums to 1.
    axis : int, optional
        Axis along which the scores are normalised (default 0, which is what
        the previous implementation summed over).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `x` whose entries sum to 1 along `axis`.
    """
    x = np.asarray(x)
    # Shift by the maximum of each slice, not the global maximum: the shift
    # cancels out mathematically, but a global shift lets a whole slice
    # underflow to zero and turn into 0/0 = nan.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Softmax score of every sepal length (uses `softmax` and `sepallength` defined in this cell).
softmax_scores = softmax(sepallength)
print(softmax_scores)

[0.00222  0.001817 0.001488 0.001346 0.002008 0.002996 0.001346 0.002008
 0.001102 0.001817 0.002996 0.001644 0.001644 0.000997 0.00447  0.004044
 0.002996 0.00222  0.004044 0.00222  0.002996 0.00222  0.001346 0.00222
 0.001644 0.002008 0.002008 0.002453 0.002453 0.001488 0.001644 0.002996
 0.002453 0.003311 0.001817 0.002008 0.003311 0.001817 0.001102 0.00222
 0.002008 0.001218 0.001102 0.002008 0.00222  0.001644 0.00222  0.001346
 0.002711 0.002008 0.01484  0.008144 0.013428 0.003311 0.009001 0.004044
 0.007369 0.001817 0.009947 0.002453 0.002008 0.00494  0.005459 0.006033
 0.003659 0.010994 0.003659 0.00447  0.006668 0.003659 0.00494  0.006033
 0.007369 0.006033 0.008144 0.009947 0.01215  0.010994 0.005459 0.004044
 0.003311 0.003311 0.00447  0.005459 0.002996 0.005459 0.010994 0.007369
 0.003659 0.003311 0.003311 0.006033 0.00447  0.002008 0.003659 0.004044
 0.004044 0.006668 0.00222  0.004044 0.007369 0.00447  0.016401 0.007369
 0.009001 0.02704  0.001817 0.020032 0.010994 0.01812

In [67]:
# 31. How to find the percentile scores of a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
# 5th and 95th percentiles of the first column.
percentile_ranks = [5, 95]
np.percentile(sepallength, q=percentile_ranks)

array([4.6  , 7.255])

In [69]:
# 32. How to insert values at random positions in an array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
# Row and column index of every truthy cell.
i, j = np.where(iris_2d)
print(iris_2d[:10])

# Solution: write np.nan at 20 randomly drawn (row, column) positions.
# The insertion is done on a copy so the rows printed above stay valid, and a
# private RandomState is used so the global random state seen by later cells
# is not disturbed.
rng = np.random.RandomState(100)
iris_2d_with_nan = iris_2d.copy()
iris_2d_with_nan[rng.choice(i, 20), rng.choice(j, 20)] = np.nan

[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'5.0' b'3.6' b'1.4' b'0.2' b'Iris-setosa']
 [b'5.4' b'3.9' b'1.7' b'0.4' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'5.0' b'3.4' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]


In [70]:
# 33. How to find the position of missing values in numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# Blank out 20 randomly drawn cells (no seed is set, so they differ per run).
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
# Only column 0 is inspected for missing values.
missing_in_col0 = np.isnan(iris_2d[:, 0])
print("Number of missing values: \n", missing_in_col0.sum())
print("Position of missing values: \n", np.where(missing_in_col0))

Number of missing values: 
 2
Position of missing values: 
 (array([67, 93], dtype=int64),)


In [71]:
# 34. How to filter a numpy array based on two or more conditions?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Rows where column 2 exceeds 1.5 and column 0 is below 5.0.
col2_above = iris_2d[:, 2] > 1.5
col0_below = iris_2d[:, 0] < 5.0
condition = col2_above & col0_below
iris_2d[condition]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

In [72]:
# 35. How to drop rows that contain a missing value from a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan
# A row is kept only when none of its entries is NaN.
row_has_nan = np.isnan(iris_2d).any(axis=1)
iris_2d[~row_has_nan][:5]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4]])

In [73]:
# 36. How to find the correlation between two columns of a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# corrcoef returns a 2x2 matrix; the off-diagonal entry is the
# correlation between the two inputs.
first_col = iris[:, 0]
third_col = iris[:, 2]
np.corrcoef(first_col, third_col)[0, 1]

0.8717541573048718

In [74]:
# 37. How to find if a given array has any null values?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# True when at least one entry is NaN.
nan_mask = np.isnan(iris_2d)
nan_mask.any()

False

In [75]:
# 38. How to replace all missing values with 0 in a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Overwrite every NaN cell in place with 0.
is_missing = np.isnan(iris_2d)
iris_2d[is_missing] = 0
iris_2d[:4]

array([[5.1, 0. , 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

In [76]:
# 39. How to find the count of unique values in a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Collect the last field (index 4) of every row.
species_values = [row.tolist()[4] for row in iris]
species = np.array(species_values)

# Returns the distinct values together with how often each occurs.
np.unique(species, return_counts=True)

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'),
 array([50, 50, 50], dtype=int64))

In [77]:
# 40. How to convert a numeric to a categorical (text) array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
# Bin column 2 using the edges 0, 3, 5, 10; digitize returns the bin number per value.
bin_edges = [0, 3, 5, 10]
petal_length_bin = np.digitize(iris[:, 2].astype('float'), bin_edges)
# Text label for each bin number.
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[bin_number] for bin_number in petal_length_bin]
petal_length_cat[:4]

['small', 'small', 'small', 'small']

In [78]:
# 41. How to create a new column from existing columns of a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')
# volume = pi * petallength * sepallength^2 / 3
volume = (np.pi * petallength * (sepallength**2))/3
# Turn the 1-D result into a single column so it can be appended on the right.
volume = volume.reshape(-1, 1)
out = np.hstack([iris_2d, volume])
out[:4]


array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

In [79]:
# 42. How to do probabilistic sampling in numpy?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
species = iris[:, 4]
np.random.seed(100)
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
# Draw 150 labels; the first label is twice as likely as each of the others.
sampling_probs = [0.5, 0.25, 0.25]
species_out = np.random.choice(a, 150, p=sampling_probs)

In [80]:
# 43. How to get the second largest value of an array when grouped by another array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Column 2 of the rows whose last field is b'Iris-setosa', as floats.
is_setosa = iris[:, 4] == b'Iris-setosa'
petal_len_setosa = iris[is_setosa, 2].astype('float')
# np.unique returns the distinct values in ascending order, so the
# second-to-last entry is the second largest distinct value.
np.unique(petal_len_setosa)[-2]

1.7

In [81]:
# 44. How to sort a 2D array by a column
# Uses `iris` as left in the kernel by an earlier cell; it is not loaded here.
# argsort gives the row order that sorts column 0; indexing with it reorders the rows.
sort_order = iris[:, 0].argsort()
print(iris[sort_order][:20])

[[b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
 [b'4.4' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'3.0' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.5' b'2.3' b'1.3' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.6' b'1.0' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.2' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.1' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.9' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.1' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.9' b'2.4' b'3.3' b'1.0' b'Iris-versicolor']
 [b'4.9' b'2.5' b'4.5' b'1.7' b'Iris-virginica']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]


In [82]:
# 45. How to find the most frequent value in a numpy array?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Distinct values of column 2 and how often each occurs.
vals, counts = np.unique(iris[:, 2], return_counts=True)
# The value whose count is largest.
most_frequent = vals[counts.argmax()]
print(most_frequent)

b'1.5'


In [83]:
# 46. How to find the position of the first occurrence of a value greater than a given value?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# argwhere lists the matching indices in ascending order; the first one is the answer.
col3_above_one = iris[:, 3].astype(float) > 1.0
np.argwhere(col3_above_one)[0]

array([50], dtype=int64)

In [84]:
# 47. How to replace all values greater than a given value to a given cutoff?
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
# Values below 10 become 10 and values above 30 become 30.
print(np.clip(a, 10, 30))

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]


In [86]:
# 48. How to get the positions of top n values from a numpy array?
np.random.seed(100)
a = np.random.uniform(1, 50, 20)
# `order` lists the positions of `a` from smallest to largest value, so its
# last five entries are the positions of the five largest values.
order = a.argsort()
print(order)
# The five largest values themselves, in ascending order.
a[order][-5:]

[ 4 13  5  8 17 12 11 14 19  1  2  0  9  6 16 18  7  3 10 15]


array([41.  , 41.47, 42.39, 44.67, 48.95])

In [87]:
# 49. How to compute the row wise counts of all possible values in an array?
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))

# Solution
def counts_of_all_values_rowwise(arr2d):
    """Count, for every row, how often each distinct value of the array occurs.

    Parameters
    ----------
    arr2d : numpy.ndarray
        2-D array; it is iterated row by row.

    Returns
    -------
    list of list of int
        One inner list per row. Position k holds how many times the k-th
        smallest distinct value of the whole array occurs in that row
        (0 when the row does not contain it).
    """
    # Distinct values of the whole array, in ascending order, computed once.
    all_values = np.unique(arr2d).tolist()

    counts = []
    for row in arr2d:
        row_values, row_counts = np.unique(row, return_counts=True)
        # A plain value -> count mapping avoids calling int() on a one-element
        # array, which newer NumPy versions deprecate.
        count_of = dict(zip(row_values.tolist(), row_counts.tolist()))
        counts.append([count_of.get(value, 0) for value in all_values])
    return counts

# Print
print(np.arange(1,11))
counts_of_all_values_rowwise(arr)

[ 1  2  3  4  5  6  7  8  9 10]


[[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
 [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
 [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
 [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
 [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
 [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]]

In [88]:
# 50. How to convert an array of arrays into a flat 1d array?

# Input:
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

# The three arrays have different lengths, so the container must be an object
# array; without dtype=object newer NumPy versions refuse to build it.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
print('array_of_arrays: ', array_of_arrays)

# Solution 1: walk every element of every inner array.
arr_2d = np.array([a for arr in array_of_arrays for a in arr])

# Solution 2: join the inner arrays end to end.
# (Despite its name, arr_2d holds the flat 1-D result.)
arr_2d = np.concatenate(array_of_arrays)
print(arr_2d)

array_of_arrays:  [array([0, 1, 2]) array([3, 4, 5, 6]) array([7, 8, 9])]
[0 1 2 3 4 5 6 7 8 9]


In [89]:
# 51. How to generate one-hot encodings for an array in numpy?
# Input:
np.random.seed(101)
arr = np.random.randint(1,4, size=6)
#> array([2, 3, 2, 2, 2, 1])

# Solution:
def one_hot_encodings(arr):
    """One-hot encode a 1-D array of labels.

    Parameters
    ----------
    arr : array_like
        1-D sequence of labels.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(arr), number of distinct labels). Row i has
        a 1 in the column of arr[i]; columns follow the ascending order of the
        distinct labels.
    """
    arr = np.asarray(arr)
    uniqs = np.unique(arr)
    # Compare every element against every distinct label. Unlike indexing
    # with `label - 1`, this does not require the labels to be exactly 1..n.
    return (arr[:, None] == uniqs[None, :]).astype(float)

one_hot_encodings(arr)

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [90]:
# 52. How to create row numbers grouped by a categorical variable?
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))

# Solution: number the elements 0, 1, 2, ... within each distinct value.
# species_small is sorted, so walking the groups in order of their value
# yields the row numbers in the same order as the elements.
row_numbers = [i for val in np.unique(species_small)
               for i, _ in enumerate(species_small[species_small == val])]

# The input stays the displayed value of the cell; the answer is in row_numbers.
species_small

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica'], dtype='<U15')

In [92]:
# 53. How to create group ids based on a given categorical variable?
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))

# Solution:
# return_inverse gives, for every element, the index of its value within the
# sorted distinct values - that index is the group id. The ids line up with
# the elements of species_small even when the input is not sorted.
uniqs, group_ids = np.unique(species_small, return_inverse=True)
output = group_ids.tolist()

print(output)

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]


In [93]:
# 54. How to rank items in an array using numpy?
np.random.seed(10)
a = np.random.randint(20, size=10)
print('Array: ', a)

# Solution
# The first argsort gives the order that sorts `a`; applying argsort to that
# order gives, for each element, its rank.
sorting_order = a.argsort()
ranks = sorting_order.argsort()
print(ranks)
print('Array: ', a)

Array:  [ 9  4 15  0 17 16 17  8  9  0]
[4 2 6 0 8 7 9 3 5 1]
Array:  [ 9  4 15  0 17 16 17  8  9  0]


In [94]:
# 55. How to rank items in a multidimensional array using numpy?
# Input:
np.random.seed(10)
a = np.random.randint(20, size=[2,5])
print(a)

# Solution
# Rank the flattened values, then restore the original shape.
flat_ranks = a.ravel().argsort().argsort()
print(flat_ranks.reshape(a.shape))

[[ 9  4 15  0 17]
 [16 17  8  9  0]]
[[4 2 6 0 8]
 [7 9 3 5 1]]


In [95]:
# 56. How to find the maximum value in each row of a numpy array 2d?
np.random.seed(100)
a = np.random.randint(1,10, [5,3])

# Solution: reducing along axis 1 gives one maximum per row.
row_max = np.amax(a, axis=1)

# The input stays the displayed value of the cell; the answer is in row_max.
a

array([[9, 9, 4],
       [8, 8, 1],
       [5, 3, 6],
       [3, 3, 3],
       [2, 1, 9]])

In [96]:
# 57. How to compute the min-by-max for each row for a numpy array 2d?
# Input
np.random.seed(100)
a = np.random.randint(1,10, [5,3])

# Solution
# Per-row minimum divided by per-row maximum.
row_min = a.min(axis=1)
row_max_vals = a.max(axis=1)
row_min / row_max_vals

array([0.44, 0.12, 0.5 , 1.  , 0.11])

In [97]:
# 58. How to find the duplicate records in a numpy array?
# Input
np.random.seed(100)
a = np.random.randint(0, 5, 10)

## Solution
# Index of the first occurrence of every distinct value.
_, unique_positions = np.unique(a, return_index=True)

# Start with every position flagged as a duplicate, then clear the flag at
# the first occurrences; what remains True are the repeats.
out = np.ones(a.shape[0], dtype=bool)
out[unique_positions] = False

print(out)

[False  True False  True False False  True  True  True  True]


In [98]:
# 59. How to find the grouped mean in numpy?
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')


# Solution
# No direct way to implement this. Just a version of a workaround.
numeric_column = iris[:, 1].astype('float')  # sepalwidth
grouping_column = iris[:, 4]  # species

# One [group value, mean of the numeric column within that group] pair per
# distinct group value, in ascending order of the group value.
output = [
    [group_val, numeric_column[grouping_column == group_val].mean()]
    for group_val in np.unique(grouping_column)
]

output

[[b'Iris-setosa', 3.418],
 [b'Iris-versicolor', 2.7700000000000005],
 [b'Iris-virginica', 2.974]]

In [99]:
# 60. How to convert a PIL image to numpy array?
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL
URL = 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status surfaces an HTTP error here instead of a confusing
# image-decoding failure further down.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Read it as Image
I = Image.open(BytesIO(response.content))

# Optionally resize
I = I.resize([150,150])

# Convert to numpy array
arr = np.asarray(I)

# Optionally convert it back to an image and show
im = PIL.Image.fromarray(np.uint8(arr))
im.show()

In [100]:
# 61. How to drop all missing values from a numpy array?
a = np.array([1, 2, 3, np.nan, 5, 6, 7, np.nan])
# Keep only the entries that are not NaN.
is_present = np.logical_not(np.isnan(a))
a[is_present]

array([1., 2., 3., 5., 6., 7.])

In [101]:
# 62. How to compute the euclidean distance between two arrays?
# Input
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])

# Solution
# The euclidean distance is the norm of the elementwise difference.
difference = a - b
dist = np.linalg.norm(difference)
dist

6.708203932499369

In [102]:
# 63. How to find all the local maxima (or peaks) in a 1d array?
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])
# Sign of each step: +1 where the series rises, -1 where it falls.
step_sign = np.sign(np.diff(a))
# A rise followed directly by a fall shows up as -2 here.
doublediff = np.diff(step_sign)
# +1 because doublediff[k] describes the element at position k + 1.
peak_locations = np.flatnonzero(doublediff == -2) + 1
peak_locations

array([2, 5], dtype=int64)

In [103]:
# 64. How to subtract a 1d array from a 2d array, where each item of 1d array subtracts from respective row?
# Input
a_2d = np.array([[3, 3, 3], [4, 4, 4], [5, 5, 5]])
b_1d = np.array([1, 2, 3])

# Solution
# Shaping b_1d as a column makes broadcasting subtract item k from row k.
b_column = b_1d.reshape(-1, 1)
print(a_2d - b_column)

[[2 2 2]
 [2 2 2]
 [2 2 2]]


In [104]:
# 65. How to find the index of n'th repetition of an item in an array
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])
n = 5

# Solution 1: List comprehension
# Positions where the value is 1; the n'th of them has list index n-1.
positions_of_one = [i for i, v in enumerate(x) if v == 1]
positions_of_one[n-1]

# Solution 2: Numpy version
np.flatnonzero(x == 1)[n-1]

8

In [105]:
# 66. How to convert numpy's datetime64 object to datetime's datetime object?
from datetime import datetime

# Input: a numpy datetime64 object
dt64 = np.datetime64('2018-02-25 22:10:10')

# Solution
# Option 1: tolist() on a datetime64 scalar returns the Python object.
dt64.tolist()

# Option 2: cast to the datetime type.
dt64.astype(datetime)

datetime.datetime(2018, 2, 25, 22, 10, 10)

In [106]:
# 67. How to compute the moving average of a numpy array?
# Source: https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
def moving_average(a, n=3) :
    """Return the mean of every window of `n` consecutive items of `a`.

    The result has len(a) - n + 1 entries. Each window sum is obtained as the
    difference of two running totals.
    """
    # Running totals with a leading zero, so the first window needs no special case.
    running_total = np.concatenate(([0.0], np.cumsum(a, dtype=float)))
    return (running_total[n:] - running_total[:-n]) / n

np.random.seed(100)
Z = np.random.randint(10, size=10)
print('array: ', Z)
# Method 1
moving_average(Z, n=3).round(2)

# Method 2:  # Thanks AlanLRH!
# Convolving with equal weights 1/3 averages each window of 3.
# Use np.ones(4)/4 for window size 4.
equal_weights = np.ones(3)/3
np.convolve(Z, equal_weights, mode='valid')

array:  [8 8 3 7 7 0 4 2 5 2]


array([6.33, 6.  , 5.67, 4.67, 3.67, 2.  , 3.67, 3.  ])

In [108]:
# 68. How to create a numpy array sequence given only the starting point, length and the step?
length = 10
start = 5
step = 3

def seq(start, length, step):
    """Return `start`, `start + step`, ... up to but excluding `start + step*length`."""
    return np.arange(start, start + (step*length), step)

seq(start, length, step)

array([ 5,  8, 11, 14, 17, 20, 23, 26, 29, 32])

In [109]:
# 69. How to fill in missing dates in an irregular series of numpy dates?
# Input
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

# Solution ---------------
# For every date, generate the run of days from it up to (not including) the
# next date. The runs are joined with np.concatenate because gaps of
# different sizes give runs of different lengths, which cannot be stacked
# into one rectangular array and reshaped.
filled_in = np.concatenate([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))])

# add the last day
output = np.hstack([filled_in, dates[-1]])

# For loop version -------
out = []
for date, d in zip(dates, np.diff(dates)):
    out.append(np.arange(date, (date+d)))

filled_in = np.concatenate(out)

# add the last day
output = np.hstack([filled_in, dates[-1]])
output

['2018-02-01' '2018-02-03' '2018-02-05' '2018-02-07' '2018-02-09'
 '2018-02-11' '2018-02-13' '2018-02-15' '2018-02-17' '2018-02-19'
 '2018-02-21' '2018-02-23']


array(['2018-02-01', '2018-02-02', '2018-02-03', '2018-02-04',
       '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08',
       '2018-02-09', '2018-02-10', '2018-02-11', '2018-02-12',
       '2018-02-13', '2018-02-14', '2018-02-15', '2018-02-16',
       '2018-02-17', '2018-02-18', '2018-02-19', '2018-02-20',
       '2018-02-21', '2018-02-22', '2018-02-23'], dtype='datetime64[D]')

In [110]:
# 70. How to create strides from a given 1D array?
def gen_strides(a, stride_len=5, window_len=5):
    """Return the windows of `window_len` items taken every `stride_len` items of `a`.

    Only complete windows are produced; the result has one row per window.
    """
    # Number of complete windows that fit into `a`.
    n_strides = ((a.size - window_len) // stride_len) + 1
    window_starts = range(0, n_strides * stride_len, stride_len)
    windows = [a[first:first + window_len] for first in window_starts]
    return np.array(windows)

print(gen_strides(np.arange(15), stride_len=2, window_len=4))

[[ 0  1  2  3]
 [ 2  3  4  5]
 [ 4  5  6  7]
 [ 6  7  8  9]
 [ 8  9 10 11]
 [10 11 12 13]]
