_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 4 - NumPy Basics: Arrays and Vectorized Computation
### Part 2 and 3

In [1]:
import numpy as np

# 2: Universal Functions - Fast Element-Wise Array Functions

In [2]:
# Universal functions (ufuncs) apply an operation to every element of an ndarray.
arr = np.arange(10)

print('A couple of unary ufuncts')
# Two unary ufuncs: element-wise square root and element-wise exponential.
square_roots = np.sqrt(arr)
exponentials = np.exp(arr)
# The trailing tuple is the value the cell displays.
arr, square_roots, exponentials

A couple of unary ufuncts


(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
        2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ]),
 array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
        5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
        2.98095799e+03, 8.10308393e+03]))

In [3]:
# Binary ufuncs take two arrays and combine them element by element;
# np.add and np.maximum are two examples.

x = np.random.randn(8)
y = np.random.randn(8)

# Print the inputs first, then the element-wise sum and element-wise maximum.
for label, values in (
    ('x:', x),
    ('y:', y),
    ('add x, y:', np.add(x, y)),
    ('maximum x, y:', np.maximum(x, y)),
):
    print(label, values)

x: [ 2.32483619  1.61571831 -0.54981743  1.2785999  -1.2955563  -0.23644509
 -1.80065476  0.42836936]
y: [-1.35726438  0.21626626  0.66391197 -2.3704838  -0.2053351  -0.33032264
 -0.85560891 -0.23047915]
add x, y: [ 0.96757181  1.83198457  0.11409454 -1.09188391 -1.50089139 -0.56676773
 -2.65626367  0.19789021]
maximum x, y: [ 2.32483619  1.61571831  0.66391197  1.2785999  -0.2053351  -0.23644509
 -0.85560891  0.42836936]


In [4]:
# np.modf is a unary ufunc that returns two arrays: the fractional parts
# and the integral parts of the input values.
arr = 5 * np.random.randn(7)
parts = np.modf(arr)
fractionals, integrals = parts
print(arr, *parts)

[ 4.44318441 -2.51687052  8.56729485  0.79683819 -7.78936876  9.16639827
 -0.77707709] [ 0.44318441 -0.51687052  0.56729485  0.79683819 -0.78936876  0.16639827
 -0.77707709] [ 4. -2.  8.  0. -7.  9. -0.]


In [5]:
# ufuncs accept an optional `out` argument naming the array that receives the result.
arr = 5 * np.random.randn(7)
output = np.zeros(arr.shape)
print(arr, output)

# Write the squares into the separate, pre-allocated array; `arr` is left untouched.
np.square(arr, out=output)
print('ufunct output sent to specified array:', arr, output)

# Passing the input itself as `out` overwrites it with the result.
np.square(arr, out=arr)
print('applying ufunct on the input data:', arr)


[-0.63120706 -2.99803262  2.52648989  8.95894632 -3.08967543  1.10692125
  0.13169856] [0. 0. 0. 0. 0. 0. 0.]
ufunct output sent to specified array: [-0.63120706 -2.99803262  2.52648989  8.95894632 -3.08967543  1.10692125
  0.13169856] [3.98422356e-01 8.98819960e+00 6.38315116e+00 8.02627192e+01
 9.54609428e+00 1.22527465e+00 1.73445101e-02]
applying ufunct on the input data: [3.98422356e-01 8.98819960e+00 6.38315116e+00 8.02627192e+01
 9.54609428e+00 1.22527465e+00 1.73445101e-02]


# 3: Array-Oriented Programming with Arrays

In [6]:
# Sample the interval [-5, 5) in steps of 0.01, which gives 1000 equally spaced points.
points = np.arange(-5, 5, 0.01)
first, last = points[0], points[-1]
print('1000 values in [{:.2f}, {:.2f}]'.format(first, last))

# meshgrid expands the 1-D samples into two 2-D coordinate arrays, so the
# distance from the origin is evaluated on the whole grid in one vectorized step.
xs, ys = np.meshgrid(points, points)
squared_distance = xs ** 2 + ys ** 2
z = np.sqrt(squared_distance)

# Show the grid of distances `z` as a grayscale image with a colour bar.
import matplotlib.pyplot as plt

plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
# Raw string: "\s" is not a valid Python escape sequence, so the LaTeX backslash
# must not be left to the string-literal parser (it warns on current Python).
# The resulting title text is unchanged.
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")

1000 values in [-5.00, 4.99]


Text(0.5,1,'Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values')

In [7]:
# np.where is the vectorized counterpart of the expression `a if cond else b`.

alpha = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
beta = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
conditions = np.array([True, False, True, True, False])

# Element-by-element selection in plain Python, for comparison.
picked = []
for from_alpha, from_beta, take_alpha in zip(alpha, beta, conditions):
    picked.append(from_alpha if take_alpha else from_beta)
print('Plain python:', picked)

# The same selection as a single vectorized call.
print('Numpy where:', np.where(conditions, alpha, beta))

Plain python: [1.1, 2.2, 1.3, 1.4, 2.5]
Numpy where: [1.1 2.2 1.3 1.4 2.5]


In [8]:
# Map every positive entry to 2 and every other entry to -2.

arr = np.random.randn(4, 4)
print(arr)
is_positive = arr > 0
print(np.where(is_positive, 2, -2))

# Replace the negative entries with 0 and keep the remaining values as they are.
is_negative = arr < 0
print(np.where(is_negative, 0, arr))

[[ 0.94103578 -2.54174536 -1.33482466  1.05390376]
 [ 0.77325313 -0.54173573  0.41515241 -0.27685186]
 [-0.63251904 -2.74821059 -0.4982704  -0.21274358]
 [-0.58326453  0.60791657 -0.30047691 -0.38730944]]
[[ 2 -2 -2  2]
 [ 2 -2  2 -2]
 [-2 -2 -2 -2]
 [-2  2 -2 -2]]
[[0.94103578 0.         0.         1.05390376]
 [0.77325313 0.         0.41515241 0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.60791657 0.         0.        ]]


In [9]:
# Mathematical and statistical methods: each aggregation exists both as an
# array method and as a top-level NumPy function.

arr = np.random.randn(5, 4)
print(arr)

# Without an axis the statistic is computed over all elements.
print('Mean:', arr.mean(), np.mean(arr))
# axis=0 yields one value per column, axis=1 one value per row.
col_means, row_means = arr.mean(axis=0), arr.mean(axis=1)
print('Mean by col:', col_means, np.mean(arr, axis=0))
print('Mean by row:', row_means, np.mean(arr, axis=1))

print('Sum:', arr.sum(), np.sum(arr))
col_sums, row_sums = arr.sum(axis=0), arr.sum(axis=1)
print('Sum by col:', col_sums, np.sum(arr, axis=0))
print('Sum by row:', row_sums, np.sum(arr, axis=1))

print('---')
arr = np.array([1, 3, 5, 7])
print(arr)
# Cumulative reductions keep every intermediate result rather than a single total.
running_sum = np.cumsum(arr)
running_prod = np.cumprod(arr)
print('Partial/cumulative sum and prod:', running_sum, running_prod, arr.cumsum(), arr.cumprod())

# 3x3 grid holding 0..8, used to show cumulative reductions along an axis.
arr = np.arange(9).reshape(3, 3)
print(arr)
print('Cumsum by col:')
print(arr.cumsum(axis=0))
print('Cumprod by row:')
print(arr.cumprod(axis=1))

[[-0.99243548  0.75329243  0.71889301  0.37497679]
 [-1.23654578  0.15893221 -0.34670194  2.23389215]
 [-1.53491209 -1.10832034 -0.75617628 -0.3432436 ]
 [-0.37732474  1.25417413  0.88953026 -0.10321525]
 [-1.36402058 -2.21639365 -2.05648309 -0.13744141]]
Mean: -0.3094761628867496 -0.3094761628867496
Mean by col: [-1.10104774 -0.23166304 -0.31018761  0.40499374] [-1.10104774 -0.23166304 -0.31018761  0.40499374]
Mean by row: [ 0.21368169  0.20239416 -0.93566308  0.4157911  -1.44358468] [ 0.21368169  0.20239416 -0.93566308  0.4157911  -1.44358468]
Sum: -6.189523257734993 -6.189523257734993
Sum by col: [-5.50523868 -1.15831522 -1.55093804  2.02496869] [-5.50523868 -1.15831522 -1.55093804  2.02496869]
Sum by row: [ 0.85472675  0.80957664 -3.74265232  1.6631644  -5.77433873] [ 0.85472675  0.80957664 -3.74265232  1.6631644  -5.77433873]
---
[1 3 5 7]
Partial/cumulative sum and prod: [ 1  4  9 16] [  1   3  15 105] [ 1  4  9 16] [  1   3  15 105]
[[0 1 2]
 [3 4 5]
 [6 7 8]]
Cumsum by col:
[[ 

In [10]:
# Sorting: np.sort() versus the ndarray.sort() method.

arr = np.random.randn(6)

print('Numpy sort() returns a sorted copy of the array:')
sorted_copy = np.sort(arr)
# `arr` still has its original order here; only the copy is sorted.
print(sorted_copy, arr)

# The method form reorders `arr` itself.
arr.sort()
print('The array method sort() do its job inplace:', arr)

Numpy sort() returns a sorted copy of the array:
[-1.0190502  -0.98375078 -0.22427877  0.42147192  0.42575137  0.88772963] [-1.0190502   0.42147192  0.88772963 -0.22427877 -0.98375078  0.42575137]
The array method sort() do its job inplace: [-1.0190502  -0.98375078 -0.22427877  0.42147192  0.42575137  0.88772963]


In [11]:
# Sorting a 2-D array along one axis at a time.

arr = np.random.randn(5, 3)
print(arr)
for axis in (0, 1):
    # axis=0 sorts the values within each column, axis=1 within each row.
    arr.sort(axis=axis)
    print(arr)

[[-1.14050148  0.14195131 -0.17276851]
 [-0.44962076  0.89053788  0.50658865]
 [ 0.66666749  0.82124941  0.48034692]
 [ 1.03324461  0.3728631  -0.36301813]
 [-0.58782971 -0.33733338  2.01885191]]
[[-1.14050148 -0.33733338 -0.36301813]
 [-0.58782971  0.14195131 -0.17276851]
 [-0.44962076  0.3728631   0.48034692]
 [ 0.66666749  0.82124941  0.50658865]
 [ 1.03324461  0.89053788  2.01885191]]
[[-1.14050148 -0.36301813 -0.33733338]
 [-0.58782971 -0.17276851  0.14195131]
 [-0.44962076  0.3728631   0.48034692]
 [ 0.50658865  0.66666749  0.82124941]
 [ 0.89053788  1.03324461  2.01885191]]


In [12]:
# Quick-and-dirty empirical quantile: sort the sample, then index into it.
arr = np.random.randn(1000)
arr.sort()

# Position of the 5% quantile within the sorted sample.
pos = int(0.05 * arr.size)
print(pos)

cutoff = arr[pos]  # 5% quantile
print(cutoff)
# All values strictly below the quantile.
print(arr[arr < cutoff])

50
-1.6330739634610623
[-2.55956394 -2.49820199 -2.45318653 -2.33145425 -2.29310515 -2.27668535
 -2.2669087  -2.25472931 -2.23986741 -2.21609272 -2.18107719 -2.12839482
 -2.11314775 -2.10485306 -2.08909404 -2.08471818 -2.08445549 -2.05726794
 -2.04937159 -2.03362271 -2.00401054 -1.99306273 -1.98186589 -1.96115384
 -1.9424268  -1.93114322 -1.90954252 -1.90574437 -1.90054853 -1.8986478
 -1.89795553 -1.89660628 -1.88398075 -1.82340871 -1.82230128 -1.81064884
 -1.80547386 -1.79709066 -1.77975201 -1.76303522 -1.75759446 -1.75115708
 -1.75010403 -1.71788728 -1.70631658 -1.70242351 -1.70167801 -1.69360397
 -1.67940352 -1.67323738]


In [13]:
# np.unique returns the distinct values of an array, sorted.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
unique_names = np.unique(names)
unique_ints = np.unique(ints)
print('Numpy unique:', unique_names, unique_ints)

# Pure-Python counterpart: de-duplicate with a set, then sort.
distinct_names, distinct_ints = set(names), set(ints)
print('Same in pure python:', sorted(distinct_names), sorted(distinct_ints))

Numpy unique: ['Bob' 'Joe' 'Will'] [1 2 3 4]
Same in pure python: ['Bob', 'Joe', 'Will'] [1, 2, 3, 4]


In [14]:
# isin - are elements of A in B?
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for 1-D inputs
# like these it returns the same boolean mask, one flag per element of A.
values = [6, 0, 0, 3, 2, 5, 6]
candidates = [2, 3, 6]
membership = np.isin(values, candidates)
print(membership)

[ True False False  True  True False  True]
