# A Quick Introduction to Numerical Data Manipulation with Python and NumPy

In [1]:
import numpy as np

## 1. DataTypes and attributes

**NOTE:** The main type in NumPy is `ndarray`. Even arrays that look very different (1-D, 2-D, 3-D, ...) are all `ndarray`s, so an operation that works on one array will generally work on another.

In [2]:
a1 = np.array([1,2,3])
a1

array([1, 2, 3])

In [3]:
# type
type(a1)

numpy.ndarray

In [4]:
a2 = np.array([[1,2.2,3.0],[4,5,6]])
a2

array([[1. , 2.2, 3. ],
       [4. , 5. , 6. ]])

In [5]:
a3 = np.array([[[1,2,3],[4,5,6],[7,8,9]],[[10,11,12],[13,14,15],[16,17,18]]])
a3

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [6]:
# shape

print(a1.shape)
print(a2.shape)
print(a3.shape)

(3,)
(2, 3)
(2, 3, 3)


In [7]:
# ndim

a1.ndim, a2.ndim, a3.ndim

(1, 2, 3)

In [8]:
# dtype

a1.dtype, a2.dtype, a3.dtype

(dtype('int32'), dtype('float64'), dtype('int32'))

In [9]:
# size

a1.size, a2.size, a3.size

(3, 6, 18)

In [10]:
# pandas Dataframe and NumPy arrays
import pandas as pd

df = pd.DataFrame(a2)
df

Unnamed: 0,0,1,2
0,1.0,2.2,3.0
1,4.0,5.0,6.0


In [11]:
df2 = pd.DataFrame(a2, columns = ["a","b","c"])
df2

Unnamed: 0,a,b,c
0,1.0,2.2,3.0
1,4.0,5.0,6.0


## 2. Creating arrays

In [12]:
sample_array = np.array([1,2,3])
sample_array

array([1, 2, 3])

In [13]:
# np.ones()

ones = np.ones((5,2))
ones

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [14]:
# np.zeros()

zeros = np.zeros((2,6))
zeros

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [15]:
# dtype with zeros or ones

zerosInt = np.zeros((2,3), dtype = int)
zerosInt

array([[0, 0, 0],
       [0, 0, 0]])

In [16]:
# range array

range_array = np.arange(0,10)
range_array

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
# random

random_array = np.random.randint(2,10,(2,2))
random_array

array([[2, 2],
       [8, 2]])

In [18]:
random_array2 = np.random.random((5,3))
random_array2

array([[0.29988733, 0.60825601, 0.3040032 ],
       [0.71853695, 0.80567081, 0.29632064],
       [0.66146244, 0.54248227, 0.42072923],
       [0.91746377, 0.91009673, 0.59374148],
       [0.43816798, 0.32951292, 0.36420061]])

In [19]:
random_array3 = np.random.rand(2,10)
random_array3

array([[0.61386467, 0.4058863 , 0.46273354, 0.07474628, 0.93444376,
        0.6908376 , 0.92963203, 0.40559062, 0.41603807, 0.20084335],
       [0.48184048, 0.14819667, 0.10643631, 0.38717042, 0.86343948,
        0.39172756, 0.77486953, 0.9914256 , 0.22291866, 0.91922058]])

NumPy generates pseudo-random numbers: they look random, but they are fully determined by the generator's starting state (the seed).

For reproducibility, you may want the random numbers you generate to be the same every time you run an experiment.

To do this, you can use `np.random.seed()`.

In [20]:
# Seed NumPy's global generator so the draw below is the same on every run.
np.random.seed(1)

# Integers drawn from [0, 100), arranged as 2 rows x 4 columns.
random_array4 = np.random.randint(low=0, high=100, size=(2, 4))
random_array4


array([[37, 12, 72,  9],
       [75,  5, 79, 64]])

In [21]:
np.random.seed(3)

random_array4 = np.random.random((2,3))
random_array4

array([[0.5507979 , 0.70814782, 0.29090474],
       [0.51082761, 0.89294695, 0.89629309]])

## 3. Viewing arrays and matrices (indexing)

In [22]:
# np.unique
intArray = np.random.randint(0,10,(2,3))
intArray

array([[9, 9, 5],
       [7, 6, 0]])

In [23]:
np.unique(intArray)

array([0, 5, 6, 7, 9])

In [24]:
# indexing

print(a1)

# first element
print("first:",a1[0])

# last element
print("last:",a1[-1])

[1 2 3]
first: 1
last: 3


In [25]:
# first row
a2[0]

array([1. , 2.2, 3. ])

In [26]:
# first column
a2[:,0]

array([1., 4.])

In [27]:
# slicing
print(a3.shape)
print(a3)

(2, 3, 3)
[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13 14 15]
  [16 17 18]]]


In [28]:
#slicing
a3[:,:2,:2]

array([[[ 1,  2],
        [ 4,  5]],

       [[10, 11],
        [13, 14]]])

## 4. Manipulating and comparing arrays

- Arithmetic
    - +, -, *, /, //, **, %
    - np.exp()
    - np.log()
    - Dot product - np.dot()
    - Broadcasting
- Aggregation
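    - np.sum() - NumPy's vectorized sum; faster than Python's built-in sum() once arrays are large (the `%timeit` demo below uses only 3 elements, where call overhead dominates)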
    - np.mean()
    - np.std()
    - np.var()
    - np.min()
    - np.max()
    - np.argmin() - find index of minimum value
    - np.argmax() - find index of maximum value
    - These work on all ndarray's
        - a4.min(axis=0) -- you can use axis as well
- Reshaping
    - np.reshape()
- Transposing
    - a3.T
- Comparison operators
    - `>`
    - `<`
    - `<=`
    - `>=`
    - x != 3
    - x == 3
    - np.sum(x > 3)

### Arithmetic

In [29]:
# +
arr1 = np.array([1,2,3])
arr2 = np.ones(3)

result = arr1 + arr2
result

array([2., 3., 4.])

In [30]:
# -
arr1 = np.array([1,2,3])
arr2 = np.ones(3)

result = arr1 - arr2
result

array([0., 1., 2.])

In [31]:
# *
arr1 = np.array([1,2,3])
arr2 = np.ones(3)

result = arr1 * arr2
result

array([1., 2., 3.])

In [32]:
# /
arr1 = np.array([1,2,3])
arr2 = np.ones(3)

result = arr2 / arr1
result

array([1.        , 0.5       , 0.33333333])

In [33]:
# //

arr1 = np.array([1,2,3])
arr2 = np.array([2,23,12])

result = arr2 // arr1
result

array([ 2, 11,  4], dtype=int32)

In [34]:
# %

arr1 = np.array([10,24,323])
arr2 = 9

result = arr1 % arr2
result

array([1, 6, 8], dtype=int32)

In [35]:
# **

arr1 = np.array([2,4,10])
arr2 = 2

result = arr1 ** arr2
result

array([  4,  16, 100], dtype=int32)

In [36]:
# Broadcasting: the (3,) vector a1 is stretched across both rows of the (2, 3) matrix a2.
# Note: this rebinds a1 and a2, replacing the arrays created in section 1.
a1 = np.array([1, 2, 3])
a2 = np.array([
    [2, 2, 1],
    [3, 5, 8],
])

a1 * a2

array([[ 2,  4,  3],
       [ 3, 10, 24]])

In [37]:
# log

a1 = np.array([1,2,3])
np.log(a1)

array([0.        , 0.69314718, 1.09861229])

In [38]:
# exp
np.exp(a1)

array([ 2.71828183,  7.3890561 , 20.08553692])

### Aggregation

In [39]:
# sum() : python function

L = [1,2,3]
sum(L)

6

In [40]:
# np.sum() : numpy function

arr = np.array([1,2,3])
np.sum(arr)

6

In [41]:
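# Timing comparison: Python's built-in sum() vs. NumPy's np.sum().
# NOTE: with only 3 elements the fixed overhead of calling into NumPy dominates,
# so np.sum() is the slower one here; its advantage shows up on much larger arrays.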

%timeit sum(L) # Python sum()
%timeit np.sum(arr) # NumPy np.sum()

163 ns ± 9.45 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
4.78 µs ± 528 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [42]:
# mean

arr2 = np.array([[1,2,3,4,12,42],[12,12,4,23,1,2]])
np.mean(arr2)

9.833333333333334

In [43]:
# min

np.min(arr2)

1

In [44]:
# max

np.max(arr2)

42

**What's mean?**

Mean is the same as average. You can find the average of a set of numbers by adding them up and dividing them by how many there are.

**What's standard deviation?**

Standard deviation is a measure of how spread out numbers are.

**What's variance?**

The variance is the average of the squared differences from the mean. The standard deviation is the square root of the variance.

In [45]:
# std : standard deviation

np.std(arr2)

11.603399884899636

In [46]:
# variance

np.var(arr2)

134.63888888888889

In [47]:
# sqrt

arr3 = np.array([1,2,3])
np.sqrt(arr3)

array([1.        , 1.41421356, 1.73205081])

### Reshaping

In [48]:
a2

array([[2, 2, 1],
       [3, 5, 8]])

In [49]:
a2.shape

(2, 3)

In [50]:
a2.reshape((3,2))

array([[2, 2],
       [1, 3],
       [5, 8]])

In [52]:
a2.reshape((6,1))

array([[2],
       [2],
       [1],
       [3],
       [5],
       [8]])

In [57]:
a2.reshape((1,2,3))

array([[[2, 2, 1],
        [3, 5, 8]]])

### Transposing

In [58]:
a2.T

array([[2, 3],
       [2, 5],
       [1, 8]])

In [59]:
np.transpose(a2)

array([[2, 3],
       [2, 5],
       [1, 8]])

### Dot

In [61]:
# Reseed so both matrices below are reproducible.
np.random.seed(0)

# Two 10 x 2 matrices of integers drawn from [0, 10).
mat1 = np.random.randint(0, 10, size=(10, 2))
mat2 = np.random.randint(0, 10, size=(10, 2))

# Show them one after the other, separated by a blank line.
print(mat1, mat2, sep="\n\n")

[[5 0]
 [3 3]
 [7 9]
 [3 5]
 [2 4]
 [7 6]
 [8 8]
 [1 6]
 [7 7]
 [8 1]]

[[5 9]
 [8 9]
 [4 3]
 [0 3]
 [5 0]
 [2 3]
 [8 1]
 [3 3]
 [3 7]
 [0 1]]


In [65]:
mat1.shape, mat2.shape

((10, 2), (10, 2))

In [67]:
# Element-wise multiplication

mat1 * mat2

array([[25,  0],
       [24, 27],
       [28, 27],
       [ 0, 15],
       [10,  0],
       [14, 18],
       [64,  8],
       [ 3, 18],
       [21, 49],
       [ 0,  1]])

In [70]:
# dot

matDot = np.dot(mat1,mat2.T)
print(matDot.shape)
print(matDot)

(10, 10)
[[ 25  40  20   0  25  10  40  15  15   0]
 [ 42  51  21   9  15  15  27  18  30   3]
 [116 137  55  27  35  41  65  48  84   9]
 [ 60  69  27  15  15  21  29  24  44   5]
 [ 46  52  20  12  10  16  20  18  34   4]
 [ 89 110  46  18  35  32  62  39  63   6]
 [112 136  56  24  40  40  72  48  80   8]
 [ 59  62  22  18   5  20  14  21  45   6]
 [ 98 119  49  21  35  35  63  42  70   7]
 [ 49  73  35   3  40  19  65  27  31   1]]


In [88]:
# Example: nut-butter sales per day and a per-day total (sales dot price)
np.random.seed(0)

# 5 days x 3 products of sales amounts, and one price per product (shape (1, 3))
sales_amounts = np.random.randint(20, size=(5, 3))
price = np.random.randint(10, size=(1, 3))

# Wrap the sales amounts in a labelled DataFrame
df_sales = pd.DataFrame(
    sales_amounts,
    index=["Mon", "Tues", "Wed", "Thurs", "Fri"],
    columns=["Almond butter", "Peanut butter", "Cashew butter"],
)
print(df_sales, price, sep="\n\n")

# Add the "Total" column: (5, 3) sales dotted with the transposed (3, 1) prices
df_sales["Total"] = np.dot(df_sales, price.T)

print(df_sales)

       Almond butter  Peanut butter  Cashew butter
Mon               12             15              0
Tues               3              3              7
Wed                9             19             18
Thurs              4              6             12
Fri                1              6              7

[[7 8 1]]
       Almond butter  Peanut butter  Cashew butter  Total
Mon               12             15              0    204
Tues               3              3              7     52
Wed                9             19             18    233
Thurs              4              6             12     88
Fri                1              6              7     62


### Comparison Operators

In [89]:
a2

array([[2, 2, 1],
       [3, 5, 8]])

In [90]:
a2 > 2

array([[False, False, False],
       [ True,  True,  True]])

In [91]:
a2 >= 2

array([[ True,  True, False],
       [ True,  True,  True]])

In [92]:
a2 < 3

array([[ True,  True,  True],
       [False, False, False]])

In [93]:
a2 <= 3

array([[ True,  True,  True],
       [ True, False, False]])

In [95]:
a2 == 1

array([[False, False,  True],
       [False, False, False]])

In [97]:
a2 != 2

array([[False, False,  True],
       [ True,  True,  True]])

### Sorting Arrays

In [98]:
random_arr = np.random.randint(10,size=(1,22))
random_arr

array([[5, 9, 8, 9, 4, 3, 0, 3, 5, 0, 2, 3, 8, 1, 3, 3, 3, 7, 0, 1, 9, 9]])

In [99]:
np.sort(random_arr)

array([[0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 7, 8, 8, 9, 9, 9, 9]])

In [101]:
random_arr.sort()
random_arr

array([[0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 7, 8, 8, 9, 9, 9, 9]])

In [102]:
random_arr2 = np.random.randint(10,size=(1,22))
random_arr2

array([[0, 4, 7, 3, 2, 7, 2, 0, 0, 4, 5, 5, 6, 8, 4, 1, 4, 9, 8, 1, 1, 7]])

In [104]:
# sort the data and return the array of indexes
np.argsort(random_arr2)

array([[ 0,  7,  8, 19, 20, 15,  4,  6,  3,  1, 16,  9, 14, 10, 11, 12,
         5,  2, 21, 18, 13, 17]], dtype=int64)

In [105]:
# index of the min value
np.argmin(random_arr2)

0

In [107]:
# index of the max value
np.argmax(random_arr2)

17

### Turn Images Into Numpy Arrays

In [108]:
from matplotlib.image import imread

<img src="../images/03-hamburger.jpeg" height=300px>

In [122]:
image = imread("../images/03-hamburger.jpeg")
print(type(image))

<class 'numpy.ndarray'>


In [123]:
print(image,image.dtype)

[[[  2   0  11]
  [  2   0  11]
  [  2   0  11]
  ...
  [151 172 199]
  [146 167 194]
  [147 168 195]]

 [[  1   0   8]
  [  1   0   8]
  [  2   1   9]
  ...
  [150 171 198]
  [145 166 193]
  [145 166 193]]

 [[  1   0   6]
  [  1   0   6]
  [  1   0   6]
  ...
  [144 168 194]
  [144 168 194]
  [146 170 196]]

 ...

 [[ 91  68  36]
  [ 93  73  40]
  [ 96  76  43]
  ...
  [ 62  35   6]
  [ 65  38   9]
  [ 67  40  13]]

 [[ 91  68  36]
  [ 92  72  39]
  [ 95  75  42]
  ...
  [ 63  36   7]
  [ 65  38  11]
  [ 68  41  14]]

 [[ 88  65  33]
  [ 88  68  35]
  [ 89  69  36]
  ...
  [ 67  40  11]
  [ 67  40  13]
  [ 67  40  13]]] uint8


In [120]:
# info 

image.shape, image.size, image.ndim

((4000, 6000, 3), 72000000, 3)

In [121]:
np.max(image), np.min(image)

(255, 0)